summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/Makefile4
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c8
-rw-r--r--arch/x86/entry/common.c2
-rw-r--r--arch/x86/events/amd/core.c24
-rw-r--r--arch/x86/events/intel/uncore_snbep.c12
-rw-r--r--arch/x86/hyperv/hv_apic.c15
-rw-r--r--arch/x86/hyperv/hv_init.c117
-rw-r--r--arch/x86/hyperv/hv_vtl.c3
-rw-r--r--arch/x86/hyperv/ivm.c263
-rw-r--r--arch/x86/include/asm/audit.h7
-rw-r--r--arch/x86/include/asm/boot.h45
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/efi.h32
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h10
-rw-r--r--arch/x86/include/asm/kexec.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h49
-rw-r--r--arch/x86/include/asm/kvm_page_track.h67
-rw-r--r--arch/x86/include/asm/linkage.h15
-rw-r--r--arch/x86/include/asm/mman.h15
-rw-r--r--arch/x86/include/asm/mmu_context.h3
-rw-r--r--arch/x86/include/asm/mshyperv.h73
-rw-r--r--arch/x86/include/asm/paravirt_types.h15
-rw-r--r--arch/x86/include/asm/pgtable.h8
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/reboot.h7
-rw-r--r--arch/x86/include/asm/svm.h5
-rw-r--r--arch/x86/include/asm/virtext.h154
-rw-r--r--arch/x86/include/asm/vmx.h2
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h37
-rw-r--r--arch/x86/include/uapi/asm/mman.h8
-rw-r--r--arch/x86/kernel/alternative.c9
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c11
-rw-r--r--arch/x86/kernel/audit_64.c5
-rw-r--r--arch/x86/kernel/callthunks.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c28
-rw-r--r--arch/x86/kernel/cpu/bugs.c17
-rw-r--r--arch/x86/kernel/cpu/common.c10
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c91
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c30
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c3
-rw-r--r--arch/x86/kernel/crash.c31
-rw-r--r--arch/x86/kernel/fpu/xstate.c2
-rw-r--r--arch/x86/kernel/i8259.c4
-rw-r--r--arch/x86/kernel/kgdb.c1
-rw-r--r--arch/x86/kernel/paravirt.c67
-rw-r--r--arch/x86/kernel/process.c7
-rw-r--r--arch/x86/kernel/reboot.c66
-rw-r--r--arch/x86/kernel/setup.c8
-rw-r--r--arch/x86/kernel/sev-shared.c69
-rw-r--r--arch/x86/kernel/sev.c3
-rw-r--r--arch/x86/kernel/shstk.c33
-rw-r--r--arch/x86/kernel/smpboot.c14
-rw-r--r--arch/x86/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86/kvm/Kconfig15
-rw-r--r--arch/x86/kvm/cpuid.c40
-rw-r--r--arch/x86/kvm/cpuid.h46
-rw-r--r--arch/x86/kvm/emulate.c2
-rw-r--r--arch/x86/kvm/governed_features.h21
-rw-r--r--arch/x86/kvm/hyperv.c1
-rw-r--r--arch/x86/kvm/kvm_emulate.h1
-rw-r--r--arch/x86/kvm/lapic.c29
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmu/mmu.c392
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h42
-rw-r--r--arch/x86/kvm/mmu/page_track.c252
-rw-r--r--arch/x86/kvm/mmu/page_track.h58
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h41
-rw-r--r--arch/x86/kvm/mmu/spte.c6
-rw-r--r--arch/x86/kvm/mmu/spte.h21
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c11
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c189
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h5
-rw-r--r--arch/x86/kvm/pmu.c4
-rw-r--r--arch/x86/kvm/reverse_cpuid.h1
-rw-r--r--arch/x86/kvm/svm/avic.c59
-rw-r--r--arch/x86/kvm/svm/nested.c57
-rw-r--r--arch/x86/kvm/svm/sev.c134
-rw-r--r--arch/x86/kvm/svm/svm.c370
-rw-r--r--arch/x86/kvm/svm/svm.h62
-rw-r--r--arch/x86/kvm/vmx/capabilities.h2
-rw-r--r--arch/x86/kvm/vmx/hyperv.c2
-rw-r--r--arch/x86/kvm/vmx/nested.c13
-rw-r--r--arch/x86/kvm/vmx/nested.h2
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c81
-rw-r--r--arch/x86/kvm/vmx/vmx.c228
-rw-r--r--arch/x86/kvm/vmx/vmx.h3
-rw-r--r--arch/x86/kvm/x86.c88
-rw-r--r--arch/x86/kvm/x86.h1
-rw-r--r--arch/x86/lib/memcpy_64.S2
-rw-r--r--arch/x86/lib/memmove_64.S2
-rw-r--r--arch/x86/lib/memset_64.S2
-rw-r--r--arch/x86/lib/putuser.S4
-rw-r--r--arch/x86/platform/efi/efi_32.c12
-rw-r--r--arch/x86/platform/efi/efi_64.c19
-rw-r--r--arch/x86/purgatory/Makefile4
-rw-r--r--arch/x86/um/Makefile5
-rw-r--r--arch/x86/um/asm/mm_context.h2
-rw-r--r--arch/x86/um/checksum_32.S214
-rw-r--r--arch/x86/xen/efi.c2
-rw-r--r--arch/x86/xen/enlighten.c2
-rw-r--r--arch/x86/xen/enlighten_hvm.c2
-rw-r--r--arch/x86/xen/enlighten_pv.c40
-rw-r--r--arch/x86/xen/mmu_pv.c55
-rw-r--r--arch/x86/xen/multicalls.h4
105 files changed, 2411 insertions, 1760 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 982b777eadc7..66bfabae8814 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1945,6 +1945,7 @@ config EFI
select UCS2_STRING
select EFI_RUNTIME_WRAPPERS
select ARCH_USE_MEMREMAP_PROT
+ select EFI_RUNTIME_MAP if KEXEC_CORE
help
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
@@ -2020,7 +2021,6 @@ config EFI_MAX_FAKE_MEM
config EFI_RUNTIME_MAP
bool "Export EFI runtime maps to sysfs" if EXPERT
depends on EFI
- default KEXEC_CORE
help
Export EFI runtime memory regions to /sys/firmware/efi/runtime-map.
That memory map is required by the 2nd kernel to set up EFI virtual
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 95315d3474a2..5bfe5caaa444 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -335,9 +335,5 @@ define archhelp
echo ' bzdisk/fdimage*/hdimage/isoimage also accept:'
echo ' FDARGS="..." arguments for the booted kernel'
echo ' FDINITRD=file initrd for the booted kernel'
- echo ''
- echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest'
- echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest'
- echo ' x86_debug.config - Enable tip tree debugging options for testing'
endef
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index bcc956c17872..08f93b0401bb 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -59,6 +59,14 @@ static void *alloc_pgt_page(void *context)
return NULL;
}
+ /* Consumed more tables than expected? */
+ if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) {
+ debug_putstr("pgt_buf running low in " __FILE__ "\n");
+ debug_putstr("Need to raise BOOT_PGT_SIZE?\n");
+ debug_putaddr(pages->pgt_buf_offset);
+ debug_putaddr(pages->pgt_buf_size);
+ }
+
entry = pages->pgt_buf + pages->pgt_buf_offset;
pages->pgt_buf_offset += PAGE_SIZE;
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 6c2826417b33..93c60c0c9d4a 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -294,7 +294,7 @@ static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
inc_irq_stat(irq_hv_callback_count);
- xen_hvm_evtchn_do_upcall();
+ xen_evtchn_do_upcall();
set_irq_regs(old_regs);
}
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index abadd5f23425..e24976593a29 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -534,8 +534,12 @@ static void amd_pmu_cpu_reset(int cpu)
/* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */
wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);
- /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */
- wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask);
+ /*
+ * Clear freeze and overflow bits i.e. PerfCntrGLobalStatus.LbrFreeze
+ * and PerfCntrGLobalStatus.PerfCntrOvfl
+ */
+ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
+ GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask);
}
static int amd_pmu_cpu_prepare(int cpu)
@@ -570,6 +574,7 @@ static void amd_pmu_cpu_starting(int cpu)
int i, nb_id;
cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+ amd_pmu_cpu_reset(cpu);
if (!x86_pmu.amd_nb_constraints)
return;
@@ -591,8 +596,6 @@ static void amd_pmu_cpu_starting(int cpu)
cpuc->amd_nb->nb_id = nb_id;
cpuc->amd_nb->refcnt++;
-
- amd_pmu_cpu_reset(cpu);
}
static void amd_pmu_cpu_dead(int cpu)
@@ -601,6 +604,7 @@ static void amd_pmu_cpu_dead(int cpu)
kfree(cpuhw->lbr_sel);
cpuhw->lbr_sel = NULL;
+ amd_pmu_cpu_reset(cpu);
if (!x86_pmu.amd_nb_constraints)
return;
@@ -613,8 +617,6 @@ static void amd_pmu_cpu_dead(int cpu)
cpuhw->amd_nb = NULL;
}
-
- amd_pmu_cpu_reset(cpu);
}
static inline void amd_pmu_set_global_ctl(u64 ctl)
@@ -884,7 +886,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
struct hw_perf_event *hwc;
struct perf_event *event;
int handled = 0, idx;
- u64 status, mask;
+ u64 reserved, status, mask;
bool pmu_enabled;
/*
@@ -909,6 +911,14 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
status &= ~GLOBAL_STATUS_LBRS_FROZEN;
}
+ reserved = status & ~amd_pmu_global_cntr_mask;
+ if (reserved)
+ pr_warn_once("Reserved PerfCntrGlobalStatus bits are set (0x%llx), please consider updating microcode\n",
+ reserved);
+
+ /* Clear any reserved bits set by buggy microcode */
+ status &= amd_pmu_global_cntr_mask;
+
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
if (!test_bit(idx, cpuc->active_mask))
continue;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 4d349986f76a..8250f0f59c2b 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -6474,8 +6474,18 @@ void spr_uncore_cpu_init(void)
type = uncore_find_type_by_id(uncore_msr_uncores, UNCORE_SPR_CHA);
if (type) {
+ /*
+ * The value from the discovery table (stored in the type->num_boxes
+ * of UNCORE_SPR_CHA) is incorrect on some SPR variants because of a
+ * firmware bug. Using the value from SPR_MSR_UNC_CBO_CONFIG to replace it.
+ */
rdmsrl(SPR_MSR_UNC_CBO_CONFIG, num_cbo);
- type->num_boxes = num_cbo;
+ /*
+ * The MSR doesn't work on the EMR XCC, but the firmware bug doesn't impact
+ * the EMR XCC. Don't let the value from the MSR replace the existing value.
+ */
+ if (num_cbo)
+ type->num_boxes = num_cbo;
}
spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO);
}
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 187e13b15e9a..97bfe5f0531f 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -175,8 +175,11 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector,
(exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask)))
return true;
- if (!hv_hypercall_pg)
- return false;
+ /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */
+ if (!hv_hypercall_pg) {
+ if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx())
+ return false;
+ }
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return false;
@@ -229,9 +232,15 @@ static bool __send_ipi_one(int cpu, int vector)
trace_hyperv_send_ipi_one(cpu, vector);
- if (!hv_hypercall_pg || (vp == VP_INVAL))
+ if (vp == VP_INVAL)
return false;
+ /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */
+ if (!hv_hypercall_pg) {
+ if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx())
+ return false;
+ }
+
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return false;
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 507d98331e7c..21556ad87f4b 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -7,6 +7,8 @@
* Author : K. Y. Srinivasan <kys@microsoft.com>
*/
+#define pr_fmt(fmt) "Hyper-V: " fmt
+
#include <linux/efi.h>
#include <linux/types.h>
#include <linux/bitfield.h>
@@ -19,6 +21,7 @@
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/idtentry.h>
+#include <asm/set_memory.h>
#include <linux/kexec.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
@@ -52,7 +55,7 @@ static int hyperv_init_ghcb(void)
void *ghcb_va;
void **ghcb_base;
- if (!hv_isolation_type_snp())
+ if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp())
return 0;
if (!hv_ghcb_pg)
@@ -80,7 +83,7 @@ static int hyperv_init_ghcb(void)
static int hv_cpu_init(unsigned int cpu)
{
union hv_vp_assist_msr_contents msr = { 0 };
- struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
+ struct hv_vp_assist_page **hvp;
int ret;
ret = hv_common_cpu_init(cpu);
@@ -90,6 +93,7 @@ static int hv_cpu_init(unsigned int cpu)
if (!hv_vp_assist_page)
return 0;
+ hvp = &hv_vp_assist_page[cpu];
if (hv_root_partition) {
/*
* For root partition we get the hypervisor provided VP assist
@@ -107,8 +111,21 @@ static int hv_cpu_init(unsigned int cpu)
* in hv_cpu_die(), otherwise a CPU may not be stopped in the
* case of CPU offlining and the VM will hang.
*/
- if (!*hvp)
+ if (!*hvp) {
*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
+
+ /*
+ * Hyper-V should never specify a VM that is a Confidential
+ * VM and also running in the root partition. Root partition
+ * is blocked to run in Confidential VM. So only decrypt assist
+ * page in non-root partition here.
+ */
+ if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1));
+ memset(*hvp, 0, PAGE_SIZE);
+ }
+ }
+
if (*hvp)
msr.pfn = vmalloc_to_pfn(*hvp);
@@ -176,7 +193,7 @@ void set_hv_tscchange_cb(void (*cb)(void))
struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
if (!hv_reenlightenment_available()) {
- pr_warn("Hyper-V: reenlightenment support is unavailable\n");
+ pr_warn("reenlightenment support is unavailable\n");
return;
}
@@ -379,6 +396,40 @@ static void __init hv_get_partition_id(void)
local_irq_restore(flags);
}
+#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE)
+static u8 __init get_vtl(void)
+{
+ u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS;
+ struct hv_get_vp_registers_input *input;
+ struct hv_get_vp_registers_output *output;
+ unsigned long flags;
+ u64 ret;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = (struct hv_get_vp_registers_output *)input;
+
+ memset(input, 0, struct_size(input, element, 1));
+ input->header.partitionid = HV_PARTITION_ID_SELF;
+ input->header.vpindex = HV_VP_INDEX_SELF;
+ input->header.inputvtl = 0;
+ input->element[0].name0 = HV_X64_REGISTER_VSM_VP_STATUS;
+
+ ret = hv_do_hypercall(control, input, output);
+ if (hv_result_success(ret)) {
+ ret = output->as64.low & HV_X64_VTL_MASK;
+ } else {
+ pr_err("Failed to get VTL(error: %lld) exiting...\n", ret);
+ BUG();
+ }
+
+ local_irq_restore(flags);
+ return ret;
+}
+#else
+static inline u8 get_vtl(void) { return 0; }
+#endif
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -399,14 +450,24 @@ void __init hyperv_init(void)
if (hv_common_init())
return;
- hv_vp_assist_page = kcalloc(num_possible_cpus(),
- sizeof(*hv_vp_assist_page), GFP_KERNEL);
+ /*
+ * The VP assist page is useless to a TDX guest: the only use we
+ * would have for it is lazy EOI, which can not be used with TDX.
+ */
+ if (hv_isolation_type_tdx())
+ hv_vp_assist_page = NULL;
+ else
+ hv_vp_assist_page = kcalloc(num_possible_cpus(),
+ sizeof(*hv_vp_assist_page),
+ GFP_KERNEL);
if (!hv_vp_assist_page) {
ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
- goto common_free;
+
+ if (!hv_isolation_type_tdx())
+ goto common_free;
}
- if (hv_isolation_type_snp()) {
+ if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
/* Negotiate GHCB Version. */
if (!hv_ghcb_negotiate_protocol())
hv_ghcb_terminate(SEV_TERM_SET_GEN,
@@ -426,12 +487,32 @@ void __init hyperv_init(void)
* Setup the hypercall page and enable hypercalls.
* 1. Register the guest ID
* 2. Enable the hypercall and register the hypercall page
+ *
+ * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg:
+ * when the hypercall input is a page, such a VM must pass a decrypted
+ * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page
+ * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present.
+ *
+ * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls,
+ * which are handled by the paravisor and the VM must use an encrypted
+ * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and
+ * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and
+ * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls:
+ * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8().
+ * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e.
+ * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg;
+ * instead, hv_post_message() uses the post_msg_page, which is decrypted
+ * in such a VM and is only used in such a VM.
*/
guest_id = hv_generate_guest_id(LINUX_VERSION_CODE);
wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
- /* Hyper-V requires to write guest os id via ghcb in SNP IVM. */
- hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
+ /* With the paravisor, the VM must also write the ID via GHCB/GHCI */
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
+ /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */
+ if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present)
+ goto skip_hypercall_pg_init;
hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX,
@@ -472,6 +553,7 @@ void __init hyperv_init(void)
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
}
+skip_hypercall_pg_init:
/*
* Some versions of Hyper-V that provide IBT in guest VMs have a bug
* in that there's no ENDBR64 instruction at the entry to the
@@ -488,7 +570,7 @@ void __init hyperv_init(void)
if (cpu_feature_enabled(X86_FEATURE_IBT) &&
*(u32 *)hv_hypercall_pg != gen_endbr()) {
setup_clear_cpu_cap(X86_FEATURE_IBT);
- pr_warn("Hyper-V: Disabling IBT because of Hyper-V bug\n");
+ pr_warn("Disabling IBT because of Hyper-V bug\n");
}
#endif
@@ -527,11 +609,17 @@ void __init hyperv_init(void)
/* Query the VMs extended capability once, so that it can be cached. */
hv_query_ext_cap(0);
+ /* Find the VTL */
+ ms_hyperv.vtl = get_vtl();
+
+ if (ms_hyperv.vtl > 0) /* non default VTL */
+ hv_vtl_early_init();
+
return;
clean_guest_os_id:
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
- hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
cpuhp_remove_state(cpuhp);
free_ghcb_page:
free_percpu(hv_ghcb_pg);
@@ -552,7 +640,7 @@ void hyperv_cleanup(void)
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
- hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
/*
* Reset hypercall page reference before reset the page,
@@ -615,6 +703,9 @@ bool hv_is_hyperv_initialized(void)
if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return false;
+ /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */
+ if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present)
+ return true;
/*
* Verify that earlier initialization succeeded by checking
* that the hypercall page is setup
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 36a562218010..999f5ac82fe9 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -215,7 +215,7 @@ static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
return hv_vtl_bringup_vcpu(vp_id, start_eip);
}
-static int __init hv_vtl_early_init(void)
+int __init hv_vtl_early_init(void)
{
/*
* `boot_cpu_has` returns the runtime feature support,
@@ -230,4 +230,3 @@ static int __init hv_vtl_early_init(void)
return 0;
}
-early_initcall(hv_vtl_early_init);
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 28be6df88063..8c6bf07f7d2b 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -18,6 +18,11 @@
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/mtrr.h>
+#include <asm/io_apic.h>
+#include <asm/realmode.h>
+#include <asm/e820/api.h>
+#include <asm/desc.h>
+#include <uapi/asm/vmx.h>
#ifdef CONFIG_AMD_MEM_ENCRYPT
@@ -56,8 +61,10 @@ union hv_ghcb {
} hypercall;
} __packed __aligned(HV_HYP_PAGE_SIZE);
+/* Only used in an SNP VM with the paravisor */
static u16 hv_ghcb_version __ro_after_init;
+/* Functions only used in an SNP VM with the paravisor go here. */
u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
{
union hv_ghcb *hv_ghcb;
@@ -175,7 +182,7 @@ bool hv_ghcb_negotiate_protocol(void)
return true;
}
-void hv_ghcb_msr_write(u64 msr, u64 value)
+static void hv_ghcb_msr_write(u64 msr, u64 value)
{
union hv_ghcb *hv_ghcb;
void **ghcb_base;
@@ -203,9 +210,8 @@ void hv_ghcb_msr_write(u64 msr, u64 value)
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(hv_ghcb_msr_write);
-void hv_ghcb_msr_read(u64 msr, u64 *value)
+static void hv_ghcb_msr_read(u64 msr, u64 *value)
{
union hv_ghcb *hv_ghcb;
void **ghcb_base;
@@ -235,7 +241,217 @@ void hv_ghcb_msr_read(u64 msr, u64 *value)
| ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(hv_ghcb_msr_read);
+
+/* Only used in a fully enlightened SNP VM, i.e. without the paravisor */
+static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE);
+static DEFINE_PER_CPU(struct sev_es_save_area *, hv_sev_vmsa);
+
+/* Functions only used in an SNP VM without the paravisor go here. */
+
+#define hv_populate_vmcb_seg(seg, gdtr_base) \
+do { \
+ if (seg.selector) { \
+ seg.base = 0; \
+ seg.limit = HV_AP_SEGMENT_LIMIT; \
+ seg.attrib = *(u16 *)(gdtr_base + seg.selector + 5); \
+ seg.attrib = (seg.attrib & 0xFF) | ((seg.attrib >> 4) & 0xF00); \
+ } \
+} while (0) \
+
+static int snp_set_vmsa(void *va, bool vmsa)
+{
+ u64 attrs;
+
+ /*
+ * Running at VMPL0 allows the kernel to change the VMSA bit for a page
+ * using the RMPADJUST instruction. However, for the instruction to
+ * succeed it must target the permissions of a lesser privileged
+ * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
+ * instruction in the AMD64 APM Volume 3).
+ */
+ attrs = 1;
+ if (vmsa)
+ attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+ return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+}
+
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
+{
+ int err;
+
+ err = snp_set_vmsa(vmsa, false);
+ if (err)
+ pr_err("clear VMSA page failed (%u), leaking page\n", err);
+ else
+ free_page((unsigned long)vmsa);
+}
+
+int hv_snp_boot_ap(int cpu, unsigned long start_ip)
+{
+ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)
+ __get_free_page(GFP_KERNEL | __GFP_ZERO);
+ struct sev_es_save_area *cur_vmsa;
+ struct desc_ptr gdtr;
+ u64 ret, retry = 5;
+ struct hv_enable_vp_vtl *start_vp_input;
+ unsigned long flags;
+
+ if (!vmsa)
+ return -ENOMEM;
+
+ native_store_gdt(&gdtr);
+
+ vmsa->gdtr.base = gdtr.address;
+ vmsa->gdtr.limit = gdtr.size;
+
+ asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector));
+ hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base);
+
+ asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector));
+ hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base);
+
+ asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector));
+ hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base);
+
+ asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector));
+ hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base);
+
+ vmsa->efer = native_read_msr(MSR_EFER);
+
+ asm volatile("movq %%cr4, %%rax;" : "=a" (vmsa->cr4));
+ asm volatile("movq %%cr3, %%rax;" : "=a" (vmsa->cr3));
+ asm volatile("movq %%cr0, %%rax;" : "=a" (vmsa->cr0));
+
+ vmsa->xcr0 = 1;
+ vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT;
+ vmsa->rip = (u64)secondary_startup_64_no_verify;
+ vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE];
+
+ /*
+ * Set the SNP-specific fields for this VMSA:
+ * VMPL level
+ * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
+ */
+ vmsa->vmpl = 0;
+ vmsa->sev_features = sev_status >> 2;
+
+ ret = snp_set_vmsa(vmsa, true);
+ if (!ret) {
+ pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret);
+ free_page((u64)vmsa);
+ return ret;
+ }
+
+ local_irq_save(flags);
+ start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg;
+ memset(start_vp_input, 0, sizeof(*start_vp_input));
+ start_vp_input->partition_id = -1;
+ start_vp_input->vp_index = cpu;
+ start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl;
+ *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1;
+
+ do {
+ ret = hv_do_hypercall(HVCALL_START_VP,
+ start_vp_input, NULL);
+ } while (hv_result(ret) == HV_STATUS_TIME_OUT && retry--);
+
+ local_irq_restore(flags);
+
+ if (!hv_result_success(ret)) {
+ pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret);
+ snp_cleanup_vmsa(vmsa);
+ vmsa = NULL;
+ }
+
+ cur_vmsa = per_cpu(hv_sev_vmsa, cpu);
+ /* Free up any previous VMSA page */
+ if (cur_vmsa)
+ snp_cleanup_vmsa(cur_vmsa);
+
+ /* Record the current VMSA page */
+ per_cpu(hv_sev_vmsa, cpu) = vmsa;
+
+ return ret;
+}
+
+#else
+static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
+static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+static void hv_tdx_msr_write(u64 msr, u64 val)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_MSR_WRITE,
+ .r12 = msr,
+ .r13 = val,
+ };
+
+ u64 ret = __tdx_hypercall(&args);
+
+ WARN_ONCE(ret, "Failed to emulate MSR write: %lld\n", ret);
+}
+
+static void hv_tdx_msr_read(u64 msr, u64 *val)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_MSR_READ,
+ .r12 = msr,
+ };
+
+ u64 ret = __tdx_hypercall_ret(&args);
+
+ if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret))
+ *val = 0;
+ else
+ *val = args.r11;
+}
+
+u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
+{
+ struct tdx_hypercall_args args = { };
+
+ args.r10 = control;
+ args.rdx = param1;
+ args.r8 = param2;
+
+ (void)__tdx_hypercall_ret(&args);
+
+ return args.r11;
+}
+
+#else
+static inline void hv_tdx_msr_write(u64 msr, u64 value) {}
+static inline void hv_tdx_msr_read(u64 msr, u64 *value) {}
+#endif /* CONFIG_INTEL_TDX_GUEST */
+
+#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
+void hv_ivm_msr_write(u64 msr, u64 value)
+{
+ if (!ms_hyperv.paravisor_present)
+ return;
+
+ if (hv_isolation_type_tdx())
+ hv_tdx_msr_write(msr, value);
+ else if (hv_isolation_type_snp())
+ hv_ghcb_msr_write(msr, value);
+}
+
+void hv_ivm_msr_read(u64 msr, u64 *value)
+{
+ if (!ms_hyperv.paravisor_present)
+ return;
+
+ if (hv_isolation_type_tdx())
+ hv_tdx_msr_read(msr, value);
+ else if (hv_isolation_type_snp())
+ hv_ghcb_msr_read(msr, value);
+}
/*
* hv_mark_gpa_visibility - Set pages visible to host via hvcall.
@@ -358,13 +574,34 @@ static bool hv_is_private_mmio(u64 addr)
void __init hv_vtom_init(void)
{
+ enum hv_isolation_type type = hv_get_isolation_type();
+
+ switch (type) {
+ case HV_ISOLATION_TYPE_VBS:
+ fallthrough;
/*
* By design, a VM using vTOM doesn't see the SEV setting,
* so SEV initialization is bypassed and sev_status isn't set.
* Set it here to indicate a vTOM VM.
+ *
+ * Note: if CONFIG_AMD_MEM_ENCRYPT is not set, sev_status is
+ * defined as 0ULL, to which we can't assigned a value.
*/
- sev_status = MSR_AMD64_SNP_VTOM;
- cc_vendor = CC_VENDOR_AMD;
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ case HV_ISOLATION_TYPE_SNP:
+ sev_status = MSR_AMD64_SNP_VTOM;
+ cc_vendor = CC_VENDOR_AMD;
+ break;
+#endif
+
+ case HV_ISOLATION_TYPE_TDX:
+ cc_vendor = CC_VENDOR_INTEL;
+ break;
+
+ default:
+ panic("hv_vtom_init: unsupported isolation type %d\n", type);
+ }
+
cc_set_mask(ms_hyperv.shared_gpa_boundary);
physical_mask &= ms_hyperv.shared_gpa_boundary - 1;
@@ -377,7 +614,7 @@ void __init hv_vtom_init(void)
mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK);
}
-#endif /* CONFIG_AMD_MEM_ENCRYPT */
+#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */
enum hv_isolation_type hv_get_isolation_type(void)
{
@@ -405,10 +642,20 @@ bool hv_is_isolation_supported(void)
DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
/*
- * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based
+ * hv_isolation_type_snp - Check if the system runs in an AMD SEV-SNP based
* isolation VM.
*/
bool hv_isolation_type_snp(void)
{
return static_branch_unlikely(&isolation_type_snp);
}
+
+DEFINE_STATIC_KEY_FALSE(isolation_type_tdx);
+/*
+ * hv_isolation_type_tdx - Check if the system runs in an Intel TDX based
+ * isolated VM.
+ */
+bool hv_isolation_type_tdx(void)
+{
+ return static_branch_unlikely(&isolation_type_tdx);
+}
diff --git a/arch/x86/include/asm/audit.h b/arch/x86/include/asm/audit.h
index 36aec57ea7a3..fa918f01333e 100644
--- a/arch/x86/include/asm/audit.h
+++ b/arch/x86/include/asm/audit.h
@@ -4,4 +4,11 @@
int ia32_classify_syscall(unsigned int syscall);
+extern unsigned ia32_dir_class[];
+extern unsigned ia32_write_class[];
+extern unsigned ia32_read_class[];
+extern unsigned ia32_chattr_class[];
+extern unsigned ia32_signal_class[];
+
+
#endif /* _ASM_X86_AUDIT_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 4ae14339cb8c..b3a7cfb0d99e 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -40,23 +40,40 @@
#ifdef CONFIG_X86_64
# define BOOT_STACK_SIZE 0x4000
+/*
+ * Used by decompressor's startup_32() to allocate page tables for identity
+ * mapping of the 4G of RAM in 4-level paging mode:
+ * - 1 level4 table;
+ * - 1 level3 table;
+ * - 4 level2 table that maps everything with 2M pages;
+ *
+ * The additional level5 table needed for 5-level paging is allocated from
+ * trampoline_32bit memory.
+ */
# define BOOT_INIT_PGT_SIZE (6*4096)
-# ifdef CONFIG_RANDOMIZE_BASE
+
/*
- * Assuming all cross the 512GB boundary:
- * 1 page for level4
- * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel
- * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP).
- * Total is 19 pages.
+ * Total number of page tables kernel_add_identity_map() can allocate,
+ * including page tables consumed by startup_32().
+ *
+ * Worst-case scenario:
+ * - 5-level paging needs 1 level5 table;
+ * - KASLR needs to map kernel, boot_params, cmdline and randomized kernel,
+ * assuming all of them cross 256T boundary:
+ * + 4*2 level4 table;
+ * + 4*2 level3 table;
+ * + 4*2 level2 table;
+ * - X86_VERBOSE_BOOTUP needs to map the first 2M (video RAM):
+ * + 1 level4 table;
+ * + 1 level3 table;
+ * + 1 level2 table;
+ * Total: 28 tables
+ *
+ * Add 4 spare table in case decompressor touches anything beyond what is
+ * accounted above. Warn if it happens.
*/
-# ifdef CONFIG_X86_VERBOSE_BOOTUP
-# define BOOT_PGT_SIZE (19*4096)
-# else /* !CONFIG_X86_VERBOSE_BOOTUP */
-# define BOOT_PGT_SIZE (17*4096)
-# endif
-# else /* !CONFIG_RANDOMIZE_BASE */
-# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE
-# endif
+# define BOOT_PGT_SIZE_WARN (28*4096)
+# define BOOT_PGT_SIZE (32*4096)
#else /* !CONFIG_X86_64 */
# define BOOT_STACK_SIZE 0x1000
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 2061ed1c398f..58cb9495e40f 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -439,6 +439,7 @@
#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
+#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */
/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index b0994ae3bc23..c4555b269a1b 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -91,19 +91,6 @@ static inline void efi_fpu_end(void)
#ifdef CONFIG_X86_32
#define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1)
-
-#define arch_efi_call_virt_setup() \
-({ \
- efi_fpu_begin(); \
- firmware_restrict_branch_speculation_start(); \
-})
-
-#define arch_efi_call_virt_teardown() \
-({ \
- firmware_restrict_branch_speculation_end(); \
- efi_fpu_end(); \
-})
-
#else /* !CONFIG_X86_32 */
#define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT
@@ -116,14 +103,6 @@ extern bool efi_disable_ibt_for_runtime;
__efi_call(__VA_ARGS__); \
})
-#define arch_efi_call_virt_setup() \
-({ \
- efi_sync_low_kernel_mappings(); \
- efi_fpu_begin(); \
- firmware_restrict_branch_speculation_start(); \
- efi_enter_mm(); \
-})
-
#undef arch_efi_call_virt
#define arch_efi_call_virt(p, f, args...) ({ \
u64 ret, ibt = ibt_save(efi_disable_ibt_for_runtime); \
@@ -132,13 +111,6 @@ extern bool efi_disable_ibt_for_runtime;
ret; \
})
-#define arch_efi_call_virt_teardown() \
-({ \
- efi_leave_mm(); \
- firmware_restrict_branch_speculation_end(); \
- efi_fpu_end(); \
-})
-
#ifdef CONFIG_KASAN
/*
* CONFIG_KASAN may redefine memset to __memset. __memset function is present
@@ -168,8 +140,8 @@ extern void efi_delete_dummy_variable(void);
extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr);
extern void efi_free_boot_services(void);
-void efi_enter_mm(void);
-void efi_leave_mm(void);
+void arch_efi_call_virt_setup(void);
+void arch_efi_call_virt_teardown(void);
/* kexec external ABI */
struct efi_setup_data {
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index cea95dcd27c2..2ff26f53cd62 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -169,7 +169,8 @@
enum hv_isolation_type {
HV_ISOLATION_TYPE_NONE = 0,
HV_ISOLATION_TYPE_VBS = 1,
- HV_ISOLATION_TYPE_SNP = 2
+ HV_ISOLATION_TYPE_SNP = 2,
+ HV_ISOLATION_TYPE_TDX = 3
};
/* Hyper-V specific model specific registers (MSRs) */
@@ -301,6 +302,13 @@ enum hv_isolation_type {
#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT
#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC
+/*
+ * Registers are only accessible via HVCALL_GET_VP_REGISTERS hvcall and
+ * there is not associated MSR address.
+ */
+#define HV_X64_REGISTER_VSM_VP_STATUS 0x000D0003
+#define HV_X64_VTL_MASK GENMASK(3, 0)
+
/* Hyper-V memory host visibility */
enum hv_mem_host_visibility {
VMBUS_PAGE_NOT_VISIBLE = 0,
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 3be6a98751f0..c9f6a6c5de3c 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -205,8 +205,6 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image);
#endif
#endif
-typedef void crash_vmclear_fn(void);
-extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
extern void kdump_nmi_shootdown_cpus(void);
#ifdef CONFIG_CRASH_HOTPLUG
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3bc146dfd38d..17715cb8731d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -288,13 +288,13 @@ struct kvm_kernel_irq_routing_entry;
* kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
* also includes TDP pages) to determine whether or not a page can be used in
* the given MMU context. This is a subset of the overall kvm_cpu_role to
- * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
- * 2 bytes per gfn instead of 4 bytes per gfn.
+ * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows
+ * allocating 2 bytes per gfn instead of 4 bytes per gfn.
*
* Upper-level shadow pages having gptes are tracked for write-protection via
- * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create
- * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
- * gfn_track will overflow and explosions will ensure.
+ * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must
+ * not create more than 2^16-1 upper-level shadow pages at a single gfn,
+ * otherwise gfn_write_track will overflow and explosions will ensue.
*
* A unique shadow page (SP) for a gfn is created if and only if an existing SP
* cannot be reused. The ability to reuse a SP is tracked by its role, which
@@ -746,7 +746,6 @@ struct kvm_vcpu_arch {
u64 smi_count;
bool at_instruction_boundary;
bool tpr_access_reporting;
- bool xsaves_enabled;
bool xfd_no_write_intercept;
u64 ia32_xss;
u64 microcode_version;
@@ -831,6 +830,25 @@ struct kvm_vcpu_arch {
struct kvm_cpuid_entry2 *cpuid_entries;
struct kvm_hypervisor_cpuid kvm_cpuid;
+ /*
+ * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly
+ * when "struct kvm_vcpu_arch" is no longer defined in an
+ * arch/x86/include/asm header. The max is mostly arbitrary, i.e.
+ * can be increased as necessary.
+ */
+#define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG
+
+ /*
+ * Track whether or not the guest is allowed to use features that are
+ * governed by KVM, where "governed" means KVM needs to manage state
+ * and/or explicitly enable the feature in hardware. Typically, but
+ * not always, governed features can be used by the guest if and only
+ * if both KVM and userspace want to expose the feature to the guest.
+ */
+ struct {
+ DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES);
+ } governed_features;
+
u64 reserved_gpa_bits;
int maxphyaddr;
@@ -1005,7 +1023,7 @@ struct kvm_lpage_info {
struct kvm_arch_memory_slot {
struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
- unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
+ unsigned short *gfn_write_track;
};
/*
@@ -1247,8 +1265,9 @@ struct kvm_arch {
* create an NX huge page (without hanging the guest).
*/
struct list_head possible_nx_huge_pages;
- struct kvm_page_track_notifier_node mmu_sp_tracker;
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
struct kvm_page_track_notifier_head track_notifier_head;
+#endif
/*
* Protects marking pages unsync during page faults, as TDP MMU page
* faults only take mmu_lock for read. For simplicity, the unsync
@@ -1400,7 +1419,6 @@ struct kvm_arch {
* the thread holds the MMU lock in write mode.
*/
spinlock_t tdp_mmu_pages_lock;
- struct workqueue_struct *tdp_mmu_zap_wq;
#endif /* CONFIG_X86_64 */
/*
@@ -1655,8 +1673,8 @@ struct kvm_x86_ops {
u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
- void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
- void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
+ void (*write_tsc_offset)(struct kvm_vcpu *vcpu);
+ void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu);
/*
* Retrieve somewhat arbitrary exit information. Intended to
@@ -1795,8 +1813,8 @@ static inline struct kvm *kvm_arch_alloc_vm(void)
#define __KVM_HAVE_ARCH_VM_FREE
void kvm_arch_free_vm(struct kvm *kvm);
-#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
-static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
+static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
if (kvm_x86_ops.flush_remote_tlbs &&
!static_call(kvm_x86_flush_remote_tlbs)(kvm))
@@ -1805,6 +1823,8 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
return -ENOTSUPP;
}
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
+
#define kvm_arch_pmi_in_guest(vcpu) \
((vcpu) && (vcpu)->arch.handling_intr_from_guest)
@@ -1814,7 +1834,7 @@ void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_init_vm(struct kvm *kvm);
+void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
@@ -1833,7 +1853,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
-void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index eb186bc57f6a..3d040741044b 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -2,11 +2,9 @@
#ifndef _ASM_X86_KVM_PAGE_TRACK_H
#define _ASM_X86_KVM_PAGE_TRACK_H
-enum kvm_page_track_mode {
- KVM_PAGE_TRACK_WRITE,
- KVM_PAGE_TRACK_MAX,
-};
+#include <linux/kvm_types.h>
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
/*
* The notifier represented by @kvm_page_track_notifier_node is linked into
* the head which will be notified when guest is triggering the track event.
@@ -26,54 +24,39 @@ struct kvm_page_track_notifier_node {
* It is called when guest is writing the write-tracked page
* and write emulation is finished at that time.
*
- * @vcpu: the vcpu where the write access happened.
* @gpa: the physical address written by guest.
* @new: the data was written to the address.
* @bytes: the written length.
* @node: this node
*/
- void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
- int bytes, struct kvm_page_track_notifier_node *node);
+ void (*track_write)(gpa_t gpa, const u8 *new, int bytes,
+ struct kvm_page_track_notifier_node *node);
+
/*
- * It is called when memory slot is being moved or removed
- * users can drop write-protection for the pages in that memory slot
+ * Invoked when a memory region is removed from the guest. Or in KVM
+ * terms, when a memslot is deleted.
*
- * @kvm: the kvm where memory slot being moved or removed
- * @slot: the memory slot being moved or removed
- * @node: this node
+ * @gfn: base gfn of the region being removed
+ * @nr_pages: number of pages in the to-be-removed region
+ * @node: this node
*/
- void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_page_track_notifier_node *node);
+ void (*track_remove_region)(gfn_t gfn, unsigned long nr_pages,
+ struct kvm_page_track_notifier_node *node);
};
-int kvm_page_track_init(struct kvm *kvm);
-void kvm_page_track_cleanup(struct kvm *kvm);
+int kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
+void kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
-bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
-int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
-
-void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
-int kvm_page_track_create_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- unsigned long npages);
-
-void kvm_slot_page_track_add_page(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode);
-void kvm_slot_page_track_remove_page(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode);
-bool kvm_slot_page_track_is_active(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- gfn_t gfn, enum kvm_page_track_mode mode);
+int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn);
+int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn);
+#else
+/*
+ * Allow defining a node in a structure even if page tracking is disabled, e.g.
+ * to play nice with testing headers via direct inclusion from the command line.
+ */
+struct kvm_page_track_notifier_node {};
+#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
-void
-kvm_page_track_register_notifier(struct kvm *kvm,
- struct kvm_page_track_notifier_node *n);
-void
-kvm_page_track_unregister_notifier(struct kvm *kvm,
- struct kvm_page_track_notifier_node *n);
-void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
- int bytes);
-void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
#endif
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 97a3de7892d3..571fe4d2d232 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -8,6 +8,14 @@
#undef notrace
#define notrace __attribute__((no_instrument_function))
+#ifdef CONFIG_64BIT
+/*
+ * The generic version tends to create spurious ENDBR instructions under
+ * certain conditions.
+ */
+#define _THIS_IP_ ({ unsigned long __here; asm ("lea 0(%%rip), %0" : "=r" (__here)); __here; })
+#endif
+
#ifdef CONFIG_X86_32
#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
#endif /* CONFIG_X86_32 */
@@ -97,6 +105,13 @@
CFI_POST_PADDING \
SYM_FUNC_END(__cfi_##name)
+/* UML needs to be able to override memcpy() and friends for KASAN. */
+#ifdef CONFIG_UML
+# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS_WEAK
+#else
+# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS
+#endif
+
/* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */
#define SYM_TYPED_FUNC_START(name) \
SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
new file mode 100644
index 000000000000..12b820259b9f
--- /dev/null
+++ b/arch/x86/include/asm/mman.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_MMAN_H__
+#define __ASM_MMAN_H__
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#define arch_calc_vm_prot_bits(prot, key) ( \
+ ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \
+ ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \
+ ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \
+ ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#endif
+
+#include <uapi/asm/mman.h>
+
+#endif /* __ASM_MMAN_H__ */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 416901d406f8..8dac45a2c7fc 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -186,8 +186,7 @@ do { \
#else
#define deactivate_mm(tsk, mm) \
do { \
- if (!tsk->vfork_done) \
- shstk_free(tsk); \
+ shstk_free(tsk); \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index fa83d88e4c99..896445edc6a8 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -26,6 +26,7 @@
union hv_ghcb;
DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
+DECLARE_STATIC_KEY_FALSE(isolation_type_tdx);
typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
@@ -40,6 +41,7 @@ static inline unsigned char hv_get_nmi_reason(void)
#if IS_ENABLED(CONFIG_HYPERV)
extern int hyperv_init_cpuhp;
+extern bool hyperv_paravisor_present;
extern void *hv_hypercall_pg;
@@ -47,10 +49,25 @@ extern u64 hv_current_partition_id;
extern union hv_ghcb * __percpu *hv_ghcb_pg;
+bool hv_isolation_type_snp(void);
+bool hv_isolation_type_tdx(void);
+u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
+
+/*
+ * DEFAULT INIT GPAT and SEGMENT LIMIT value in struct VMSA
+ * to start AP in enlightened SEV guest.
+ */
+#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
+#define HV_AP_SEGMENT_LIMIT 0xffffffff
+
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
+/*
+ * If the hypercall involves no input or output parameters, the hypervisor
+ * ignores the corresponding GPA pointer.
+ */
static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
{
u64 input_address = input ? virt_to_phys(input) : 0;
@@ -58,6 +75,19 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
u64 hv_status;
#ifdef CONFIG_X86_64
+ if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
+ return hv_tdx_hypercall(control, input_address, output_address);
+
+ if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
+ __asm__ __volatile__("mov %4, %%r8\n"
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input_address)
+ : "r" (output_address)
+ : "cc", "memory", "r8", "r9", "r10", "r11");
+ return hv_status;
+ }
+
if (!hv_hypercall_pg)
return U64_MAX;
@@ -101,7 +131,16 @@ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
u64 hv_status;
#ifdef CONFIG_X86_64
- {
+ if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
+ return hv_tdx_hypercall(control, input1, 0);
+
+ if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
+ __asm__ __volatile__(
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ :: "cc", "r8", "r9", "r10", "r11");
+ } else {
__asm__ __volatile__(CALL_NOSPEC
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input1)
@@ -146,7 +185,17 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
u64 hv_status;
#ifdef CONFIG_X86_64
- {
+ if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
+ return hv_tdx_hypercall(control, input1, input2);
+
+ if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
+ __asm__ __volatile__("mov %4, %%r8\n"
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ : "r" (input2)
+ : "cc", "r8", "r9", "r10", "r11");
+ } else {
__asm__ __volatile__("mov %4, %%r8\n"
CALL_NOSPEC
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
@@ -225,20 +274,24 @@ int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
#ifdef CONFIG_AMD_MEM_ENCRYPT
-void hv_ghcb_msr_write(u64 msr, u64 value);
-void hv_ghcb_msr_read(u64 msr, u64 *value);
bool hv_ghcb_negotiate_protocol(void);
void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
-void hv_vtom_init(void);
+int hv_snp_boot_ap(int cpu, unsigned long start_ip);
#else
-static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
-static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
-static inline void hv_vtom_init(void) {}
+static inline int hv_snp_boot_ap(int cpu, unsigned long start_ip) { return 0; }
#endif
-extern bool hv_isolation_type_snp(void);
+#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
+void hv_vtom_init(void);
+void hv_ivm_msr_write(u64 msr, u64 value);
+void hv_ivm_msr_read(u64 msr, u64 *value);
+#else
+static inline void hv_vtom_init(void) {}
+static inline void hv_ivm_msr_write(u64 msr, u64 value) {}
+static inline void hv_ivm_msr_read(u64 msr, u64 *value) {}
+#endif
static inline bool hv_is_synic_reg(unsigned int reg)
{
@@ -287,8 +340,10 @@ static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; }
#ifdef CONFIG_HYPERV_VTL_MODE
void __init hv_vtl_init_platform(void);
+int __init hv_vtl_early_init(void);
#else
static inline void __init hv_vtl_init_platform(void) {}
+static inline int __init hv_vtl_early_init(void) { return 0; }
#endif
#include <asm-generic/mshyperv.h>
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 4acbcddddc29..772d03487520 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -9,13 +9,6 @@ struct paravirt_patch_site {
u8 type; /* type of this instruction */
u8 len; /* length of original instruction */
};
-
-/* Lazy mode for batching updates / context switch */
-enum paravirt_lazy_mode {
- PARAVIRT_LAZY_NONE,
- PARAVIRT_LAZY_MMU,
- PARAVIRT_LAZY_CPU,
-};
#endif
#ifdef CONFIG_PARAVIRT
@@ -549,14 +542,6 @@ int paravirt_disable_iospace(void);
__PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
-void paravirt_start_context_switch(struct task_struct *prev);
-void paravirt_end_context_switch(struct task_struct *next);
-
-void paravirt_enter_lazy_mmu(void);
-void paravirt_leave_lazy_mmu(void);
-void paravirt_flush_lazy_mmu(void);
-
void _paravirt_nop(void);
void paravirt_BUG(void);
unsigned long paravirt_ret0(void);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d6ad98ca1288..e02b179ec659 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -955,6 +955,14 @@ static inline int pte_same(pte_t a, pte_t b)
return a.pte == b.pte;
}
+static inline pte_t pte_next_pfn(pte_t pte)
+{
+ if (__pte_needs_invert(pte_val(pte)))
+ return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT));
+ return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+}
+#define pte_next_pfn pte_next_pfn
+
static inline int pte_present(pte_t a)
{
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 0086920cda06..a3669a7774ed 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -683,13 +683,11 @@ extern u16 get_llc_id(unsigned int cpu);
#ifdef CONFIG_CPU_SUP_AMD
extern u32 amd_get_nodes_per_socket(void);
extern u32 amd_get_highest_perf(void);
-extern bool cpu_has_ibpb_brtype_microcode(void);
extern void amd_clear_divider(void);
extern void amd_check_microcode(void);
#else
static inline u32 amd_get_nodes_per_socket(void) { return 0; }
static inline u32 amd_get_highest_perf(void) { return 0; }
-static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; }
static inline void amd_clear_divider(void) { }
static inline void amd_check_microcode(void) { }
#endif
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 9177b4354c3f..6536873f8fc0 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,7 +25,14 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
+#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
+typedef void (cpu_emergency_virt_cb)(void);
+void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
+void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_disable_virtualization(void);
+#else
+static inline void cpu_emergency_disable_virtualization(void) {}
+#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index e7c7379d6ac7..19bf955b67e0 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -288,6 +288,7 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
+#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
struct vmcb_seg {
u16 selector;
@@ -345,7 +346,7 @@ struct vmcb_save_area {
u64 last_excp_from;
u64 last_excp_to;
u8 reserved_0x298[72];
- u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
+ u64 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
} __packed;
/* Save area definition for SEV-ES and SEV-SNP guests */
@@ -512,7 +513,7 @@ struct ghcb {
} __packed;
-#define EXPECTED_VMCB_SAVE_AREA_SIZE 740
+#define EXPECTED_VMCB_SAVE_AREA_SIZE 744
#define EXPECTED_GHCB_SAVE_AREA_SIZE 1032
#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648
#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
deleted file mode 100644
index 3b12e6b99412..000000000000
--- a/arch/x86/include/asm/virtext.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/* CPU virtualization extensions handling
- *
- * This should carry the code for handling CPU virtualization extensions
- * that needs to live in the kernel core.
- *
- * Author: Eduardo Habkost <ehabkost@redhat.com>
- *
- * Copyright (C) 2008, Red Hat Inc.
- *
- * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
- */
-#ifndef _ASM_X86_VIRTEX_H
-#define _ASM_X86_VIRTEX_H
-
-#include <asm/processor.h>
-
-#include <asm/vmx.h>
-#include <asm/svm.h>
-#include <asm/tlbflush.h>
-
-/*
- * VMX functions:
- */
-
-static inline int cpu_has_vmx(void)
-{
- unsigned long ecx = cpuid_ecx(1);
- return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
-}
-
-
-/**
- * cpu_vmxoff() - Disable VMX on the current CPU
- *
- * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
- *
- * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
- * atomically track post-VMXON state, e.g. this may be called in NMI context.
- * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
- * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
- * magically in RM, VM86, compat mode, or at CPL>0.
- */
-static inline int cpu_vmxoff(void)
-{
- asm_volatile_goto("1: vmxoff\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- ::: "cc", "memory" : fault);
-
- cr4_clear_bits(X86_CR4_VMXE);
- return 0;
-
-fault:
- cr4_clear_bits(X86_CR4_VMXE);
- return -EIO;
-}
-
-static inline int cpu_vmx_enabled(void)
-{
- return __read_cr4() & X86_CR4_VMXE;
-}
-
-/** Disable VMX if it is enabled on the current CPU
- *
- * You shouldn't call this if cpu_has_vmx() returns 0.
- */
-static inline void __cpu_emergency_vmxoff(void)
-{
- if (cpu_vmx_enabled())
- cpu_vmxoff();
-}
-
-/** Disable VMX if it is supported and enabled on the current CPU
- */
-static inline void cpu_emergency_vmxoff(void)
-{
- if (cpu_has_vmx())
- __cpu_emergency_vmxoff();
-}
-
-
-
-
-/*
- * SVM functions:
- */
-
-/** Check if the CPU has SVM support
- *
- * You can use the 'msg' arg to get a message describing the problem,
- * if the function returns zero. Simply pass NULL if you are not interested
- * on the messages; gcc should take care of not generating code for
- * the messages on this case.
- */
-static inline int cpu_has_svm(const char **msg)
-{
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) {
- if (msg)
- *msg = "not amd or hygon";
- return 0;
- }
-
- if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) {
- if (msg)
- *msg = "can't execute cpuid_8000000a";
- return 0;
- }
-
- if (!boot_cpu_has(X86_FEATURE_SVM)) {
- if (msg)
- *msg = "svm not available";
- return 0;
- }
- return 1;
-}
-
-
-/** Disable SVM on the current CPU
- *
- * You should call this only if cpu_has_svm() returned true.
- */
-static inline void cpu_svm_disable(void)
-{
- uint64_t efer;
-
- wrmsrl(MSR_VM_HSAVE_PA, 0);
- rdmsrl(MSR_EFER, efer);
- if (efer & EFER_SVME) {
- /*
- * Force GIF=1 prior to disabling SVM to ensure INIT and NMI
- * aren't blocked, e.g. if a fatal error occurred between CLGI
- * and STGI. Note, STGI may #UD if SVM is disabled from NMI
- * context between reading EFER and executing STGI. In that
- * case, GIF must already be set, otherwise the NMI would have
- * been blocked, so just eat the fault.
- */
- asm_volatile_goto("1: stgi\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- ::: "memory" : fault);
-fault:
- wrmsrl(MSR_EFER, efer & ~EFER_SVME);
- }
-}
-
-/** Makes sure SVM is disabled, if it is supported on the CPU
- */
-static inline void cpu_emergency_svm_disable(void)
-{
- if (cpu_has_svm(NULL))
- cpu_svm_disable();
-}
-
-#endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 0d02c4aafa6f..0e73616b82f3 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -71,7 +71,7 @@
#define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING)
#define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
#define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
-#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES)
+#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES)
#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
#define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA)
#define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING)
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 5fc35f889cd1..7048dfacc04b 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -36,6 +36,7 @@
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
+#include <asm/bug.h>
#include <asm/processor.h>
#define XEN_SIGNATURE "XenVMMXenVMM"
@@ -63,4 +64,40 @@ void __init xen_pvh_init(struct boot_params *boot_params);
void __init mem_map_via_hcall(struct boot_params *boot_params_p);
#endif
+/* Lazy mode for batching updates / context switch */
+enum xen_lazy_mode {
+ XEN_LAZY_NONE,
+ XEN_LAZY_MMU,
+ XEN_LAZY_CPU,
+};
+
+DECLARE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode);
+DECLARE_PER_CPU(unsigned int, xen_lazy_nesting);
+
+static inline void enter_lazy(enum xen_lazy_mode mode)
+{
+ enum xen_lazy_mode old_mode = this_cpu_read(xen_lazy_mode);
+
+ if (mode == old_mode) {
+ this_cpu_inc(xen_lazy_nesting);
+ return;
+ }
+
+ BUG_ON(old_mode != XEN_LAZY_NONE);
+
+ this_cpu_write(xen_lazy_mode, mode);
+}
+
+static inline void leave_lazy(enum xen_lazy_mode mode)
+{
+ BUG_ON(this_cpu_read(xen_lazy_mode) != mode);
+
+ if (this_cpu_read(xen_lazy_nesting) == 0)
+ this_cpu_write(xen_lazy_mode, XEN_LAZY_NONE);
+ else
+ this_cpu_dec(xen_lazy_nesting);
+}
+
+enum xen_lazy_mode xen_get_lazy_mode(void);
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 8148bdddbd2c..46cdc941f958 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -5,14 +5,6 @@
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
#define MAP_ABOVE4G 0x80 /* only map above 4GB */
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-#define arch_calc_vm_prot_bits(prot, key) ( \
- ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \
- ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \
- ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \
- ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
-#endif
-
/* Flags for map_shadow_stack(2) */
#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a5ead6a6d233..517ee01503be 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -720,13 +720,8 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
s32 *s;
- /*
- * Do not patch out the default return thunks if those needed are the
- * ones generated by the compiler.
- */
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK) &&
- (x86_return_thunk == __x86_return_thunk))
- return;
+ if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ static_call_force_reinit();
for (s = start; s < end; s++) {
void *dest = NULL, *addr = (void *)s + *s;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d9f5d7492f83..205cee567629 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -1533,7 +1533,7 @@ static void __init build_socket_tables(void)
{
struct uv_gam_range_entry *gre = uv_gre_table;
int nums, numn, nump;
- int cpu, i, lnid;
+ int i, lnid, apicid;
int minsock = _min_socket;
int maxsock = _max_socket;
int minpnode = _min_pnode;
@@ -1584,15 +1584,14 @@ static void __init build_socket_tables(void)
/* Set socket -> node values: */
lnid = NUMA_NO_NODE;
- for_each_possible_cpu(cpu) {
- int nid = cpu_to_node(cpu);
- int apicid, sockid;
+ for (apicid = 0; apicid < ARRAY_SIZE(__apicid_to_node); apicid++) {
+ int nid = __apicid_to_node[apicid];
+ int sockid;
- if (lnid == nid)
+ if ((nid == NUMA_NO_NODE) || (lnid == nid))
continue;
lnid = nid;
- apicid = per_cpu(x86_cpu_to_apicid, cpu);
sockid = apicid >> uv_cpuid.socketid_shift;
if (_socket_to_node[sockid - minsock] == SOCK_EMPTY)
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 44c3601cfdc4..190c120f4285 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -63,11 +63,6 @@ int audit_classify_syscall(int abi, unsigned syscall)
static int __init audit_classes_init(void)
{
#ifdef CONFIG_IA32_EMULATION
- extern __u32 ia32_dir_class[];
- extern __u32 ia32_write_class[];
- extern __u32 ia32_read_class[];
- extern __u32 ia32_chattr_class[];
- extern __u32 ia32_signal_class[];
audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index c06bfc086565..faa9f2299848 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -272,7 +272,6 @@ void __init callthunks_patch_builtin_calls(void)
pr_info("Setting up call depth tracking\n");
mutex_lock(&text_mutex);
callthunks_setup(&cs, &builtin_coretext);
- static_call_force_reinit();
thunks_initialized = true;
mutex_unlock(&text_mutex);
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dd8379d84445..03ef962a6992 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -766,6 +766,15 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_TOPOEXT))
smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
+ if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ setup_force_cpu_cap(X86_FEATURE_SBPB);
+ }
+ }
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -1301,25 +1310,6 @@ void amd_check_microcode(void)
on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
-bool cpu_has_ibpb_brtype_microcode(void)
-{
- switch (boot_cpu_data.x86) {
- /* Zen1/2 IBPB flushes branch type predictions too. */
- case 0x17:
- return boot_cpu_has(X86_FEATURE_AMD_IBPB);
- case 0x19:
- /* Poke the MSR bit on Zen3/4 to check its presence. */
- if (!wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
- setup_force_cpu_cap(X86_FEATURE_SBPB);
- return true;
- } else {
- return false;
- }
- default:
- return false;
- }
-}
-
/*
* Issue a DIV 0/1 insn to clear any division data from previous DIV
* operations.
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index f081d26616ac..10499bcd4e39 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -2404,27 +2404,16 @@ early_param("spec_rstack_overflow", srso_parse_cmdline);
static void __init srso_select_mitigation(void)
{
- bool has_microcode;
+ bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
goto pred_cmd;
- /*
- * The first check is for the kernel running as a guest in order
- * for guests to verify whether IBPB is a viable mitigation.
- */
- has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) || cpu_has_ibpb_brtype_microcode();
if (!has_microcode) {
pr_warn("IBPB-extending microcode not applied!\n");
pr_warn(SRSO_NOTICE);
} else {
/*
- * Enable the synthetic (even if in a real CPUID leaf)
- * flags for guests.
- */
- setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
-
- /*
* Zen1/2 with SMT off aren't vulnerable after the right
* IBPB microcode has been applied.
*/
@@ -2444,7 +2433,7 @@ static void __init srso_select_mitigation(void)
switch (srso_cmd) {
case SRSO_CMD_OFF:
- return;
+ goto pred_cmd;
case SRSO_CMD_MICROCODE:
if (has_microcode) {
@@ -2717,7 +2706,7 @@ static ssize_t srso_show_state(char *buf)
return sysfs_emit(buf, "%s%s\n",
srso_strings[srso_mitigation],
- (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode"));
+ boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? "" : ", no microcode");
}
static ssize_t gds_show_state(char *buf)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6d75fab10161..4e5ffc8b0e46 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1280,11 +1280,11 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
@@ -1303,7 +1303,7 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
- VULNBL_HYGON(0x18, RETBLEED | SMT_RSB),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
VULNBL_AMD(0x19, SRSO),
{}
};
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0100468e72ca..e6bba12c759c 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -32,6 +32,7 @@
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
#include <asm/numa.h>
+#include <asm/svm.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
@@ -39,6 +40,10 @@ bool hv_root_partition;
bool hv_nested;
struct ms_hyperv_info ms_hyperv;
+/* Used in modules via hv_do_hypercall(): see arch/x86/include/asm/mshyperv.h */
+bool hyperv_paravisor_present __ro_after_init;
+EXPORT_SYMBOL_GPL(hyperv_paravisor_present);
+
#if IS_ENABLED(CONFIG_HYPERV)
static inline unsigned int hv_get_nested_reg(unsigned int reg)
{
@@ -65,8 +70,8 @@ u64 hv_get_non_nested_register(unsigned int reg)
{
u64 value;
- if (hv_is_synic_reg(reg) && hv_isolation_type_snp())
- hv_ghcb_msr_read(reg, &value);
+ if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present)
+ hv_ivm_msr_read(reg, &value);
else
rdmsrl(reg, value);
return value;
@@ -75,8 +80,8 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_register);
void hv_set_non_nested_register(unsigned int reg, u64 value)
{
- if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) {
- hv_ghcb_msr_write(reg, value);
+ if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) {
+ hv_ivm_msr_write(reg, value);
/* Write proxy bit via wrmsl instruction */
if (hv_is_sint_reg(reg))
@@ -295,6 +300,15 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
native_smp_prepare_cpus(max_cpus);
+ /*
+ * Override wakeup_secondary_cpu_64 callback for SEV-SNP
+ * enlightened guest.
+ */
+ if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap;
+ return;
+ }
+
#ifdef CONFIG_X86_64
for_each_present_cpu(i) {
if (i == 0)
@@ -313,6 +327,26 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
}
#endif
+/*
+ * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the
+ * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0
+ * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns
+ * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a
+ * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and
+ * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs
+ * from the array and hence doesn't create the necessary irq description info.
+ *
+ * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here,
+ * except don't change 'legacy_pic', which keeps its default value
+ * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero
+ * nr_legacy_irqs() and eventually serial console interrupts works properly.
+ */
+static void __init reduced_hw_init(void)
+{
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+}
+
static void __init ms_hyperv_init_platform(void)
{
int hv_max_functions_eax;
@@ -399,11 +433,33 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.shared_gpa_boundary =
BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+ hyperv_paravisor_present = !!ms_hyperv.paravisor_present;
+
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
- if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
+
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
static_branch_enable(&isolation_type_snp);
+ } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
+ static_branch_enable(&isolation_type_tdx);
+
+ /* A TDX VM must use x2APIC and doesn't use lazy EOI. */
+ ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
+
+ if (!ms_hyperv.paravisor_present) {
+ /* To be supported: more work is required. */
+ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+
+ /* HV_REGISTER_CRASH_CTL is unsupported. */
+ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+ /* Don't trust Hyper-V's TLB-flushing hypercalls. */
+ ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+
+ x86_init.acpi.reduced_hw_early_init = reduced_hw_init;
+ }
+ }
}
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
@@ -473,7 +529,7 @@ static void __init ms_hyperv_init_platform(void)
#if IS_ENABLED(CONFIG_HYPERV)
if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
- (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP))
+ ms_hyperv.paravisor_present)
hv_vtom_init();
/*
* Setup the hook to get control post apic initialization.
@@ -497,7 +553,8 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
- if (hv_root_partition)
+ if (hv_root_partition ||
+ (!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
@@ -560,6 +617,22 @@ static bool __init ms_hyperv_msi_ext_dest_id(void)
return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_r8(ghcb, regs->r8);
+}
+
+static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
+}
+#endif
+
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
@@ -567,4 +640,8 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.init.x2apic_available = ms_hyperv_x2apic_available,
.init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
+#endif
};
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 91fa70e51004..279148e72459 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
return epc_page;
}
+/*
+ * Ensure the SECS page is not swapped out. Must be called with encl->lock
+ * to protect the enclave states including SECS and ensure the SECS page is
+ * not swapped out again while being used.
+ */
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
+{
+ struct sgx_epc_page *epc_page = encl->secs.epc_page;
+
+ if (!epc_page)
+ epc_page = sgx_encl_eldu(&encl->secs, NULL);
+
+ return epc_page;
+}
+
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
struct sgx_encl_page *entry)
{
@@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
return entry;
}
- if (!(encl->secs.epc_page)) {
- epc_page = sgx_encl_eldu(&encl->secs, NULL);
- if (IS_ERR(epc_page))
- return ERR_CAST(epc_page);
- }
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
if (IS_ERR(epc_page))
@@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
epc_page = sgx_alloc_epc_page(encl_page, false);
if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index c3e37eaec8ec..7aaa3652e31d 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -204,6 +204,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
continue;
xa_erase(&vepc->page_array, index);
+ cond_resched();
}
/*
@@ -222,6 +223,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
list_add_tail(&epc_page->list, &secs_pages);
xa_erase(&vepc->page_array, index);
+ cond_resched();
}
/*
@@ -243,6 +245,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
if (sgx_vepc_free_page(epc_page))
list_add_tail(&epc_page->list, &secs_pages);
+ cond_resched();
}
if (!list_empty(&secs_pages))
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 587c7743fd21..c92d88680dbf 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -48,27 +48,6 @@ struct crash_memmap_data {
unsigned int type;
};
-/*
- * This is used to VMCLEAR all VMCSs loaded on the
- * processor. And when loading kvm_intel module, the
- * callback function pointer will be assigned.
- *
- * protected by rcu.
- */
-crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
-EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
-
-static inline void cpu_crash_vmclear_loaded_vmcss(void)
-{
- crash_vmclear_fn *do_vmclear_operation = NULL;
-
- rcu_read_lock();
- do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
- if (do_vmclear_operation)
- do_vmclear_operation();
- rcu_read_unlock();
-}
-
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -76,11 +55,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
crash_save_cpu(regs, cpu);
/*
- * VMCLEAR VMCSs loaded on all cpus if needed.
- */
- cpu_crash_vmclear_loaded_vmcss();
-
- /*
* Disable Intel PT to stop its logging
*/
cpu_emergency_stop_pt();
@@ -133,11 +107,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
crash_smp_send_stop();
- /*
- * VMCLEAR VMCSs loaded on this cpu if needed.
- */
- cpu_crash_vmclear_loaded_vmcss();
-
cpu_emergency_disable_virtualization();
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 41dac93b8ea4..cadf68737e6b 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -71,7 +71,7 @@ static unsigned short xsave_cpuid_features[] __initdata = {
[XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F,
[XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F,
[XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT,
- [XFEATURE_PKRU] = X86_FEATURE_PKU,
+ [XFEATURE_PKRU] = X86_FEATURE_OSPKE,
[XFEATURE_PASID] = X86_FEATURE_ENQCMD,
[XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
[XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 4d8aff05a509..30a55207c000 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -231,9 +231,7 @@ struct irq_chip i8259A_chip = {
};
static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
+/* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */
static void restore_ELCR(char *trigger)
{
outb(trigger[0], PIC_ELCR1);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 3a43a2dee658..9c9faa1634fb 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -695,7 +695,6 @@ void kgdb_arch_exit(void)
}
/**
- *
* kgdb_skipexception - Bail out of KGDB when we've been triggered.
* @exception: Exception vector number
* @regs: Current &struct pt_regs.
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 975f98d5eee5..97f1436c1a20 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -143,66 +143,7 @@ int paravirt_disable_iospace(void)
return request_resource(&ioport_resource, &reserve_ioports);
}
-static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
-
-static inline void enter_lazy(enum paravirt_lazy_mode mode)
-{
- BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
-
- this_cpu_write(paravirt_lazy_mode, mode);
-}
-
-static void leave_lazy(enum paravirt_lazy_mode mode)
-{
- BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
-
- this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
-}
-
-void paravirt_enter_lazy_mmu(void)
-{
- enter_lazy(PARAVIRT_LAZY_MMU);
-}
-
-void paravirt_leave_lazy_mmu(void)
-{
- leave_lazy(PARAVIRT_LAZY_MMU);
-}
-
-void paravirt_flush_lazy_mmu(void)
-{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- arch_enter_lazy_mmu_mode();
- }
-
- preempt_enable();
-}
-
#ifdef CONFIG_PARAVIRT_XXL
-void paravirt_start_context_switch(struct task_struct *prev)
-{
- BUG_ON(preemptible());
-
- if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
- arch_leave_lazy_mmu_mode();
- set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
- }
- enter_lazy(PARAVIRT_LAZY_CPU);
-}
-
-void paravirt_end_context_switch(struct task_struct *next)
-{
- BUG_ON(preemptible());
-
- leave_lazy(PARAVIRT_LAZY_CPU);
-
- if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
- arch_enter_lazy_mmu_mode();
-}
-
static noinstr void pv_native_write_cr2(unsigned long val)
{
native_write_cr2(val);
@@ -229,14 +170,6 @@ static noinstr void pv_native_safe_halt(void)
}
#endif
-enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
-{
- if (in_interrupt())
- return PARAVIRT_LAZY_NONE;
-
- return this_cpu_read(paravirt_lazy_mode);
-}
-
struct pv_info pv_info = {
.name = "bare hardware",
#ifdef CONFIG_PARAVIRT_XXL
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9f0909142a0a..b6f4e8399fca 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -257,13 +257,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
io_bitmap_share(p);
- /*
- * If copy_thread() if failing, don't leak the shadow stack possibly
- * allocated in shstk_alloc_thread_stack() above.
- */
- if (ret)
- shstk_free(p);
-
return ret;
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 3adbe97015c1..830425e6d38e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -22,7 +22,6 @@
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
#include <asm/pci_x86.h>
-#include <asm/virtext.h>
#include <asm/cpu.h>
#include <asm/nmi.h>
#include <asm/smp.h>
@@ -530,9 +529,54 @@ static inline void kb_wait(void)
static inline void nmi_shootdown_cpus_on_restart(void);
+#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
+/* RCU-protected callback to disable virtualization prior to reboot. */
+static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
+
+void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback)
+{
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback)))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, callback);
+}
+EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback);
+
+void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback)
+{
+ if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback))
+ return;
+
+ rcu_assign_pointer(cpu_emergency_virt_callback, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback);
+
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_virt_cb *callback;
+
+ /*
+ * IRQs must be disabled as KVM enables virtualization in hardware via
+ * function call IPIs, i.e. IRQs need to be disabled to guarantee
+ * virtualization stays disabled.
+ */
+ lockdep_assert_irqs_disabled();
+
+ rcu_read_lock();
+ callback = rcu_dereference(cpu_emergency_virt_callback);
+ if (callback)
+ callback();
+ rcu_read_unlock();
+}
+
static void emergency_reboot_disable_virtualization(void)
{
- /* Just make sure we won't change CPUs while doing this */
local_irq_disable();
/*
@@ -545,7 +589,7 @@ static void emergency_reboot_disable_virtualization(void)
* Do the NMI shootdown even if virtualization is off on _this_ CPU, as
* other CPUs may have virtualization enabled.
*/
- if (cpu_has_vmx() || cpu_has_svm(NULL)) {
+ if (rcu_access_pointer(cpu_emergency_virt_callback)) {
/* Safely force _this_ CPU out of VMX/SVM operation. */
cpu_emergency_disable_virtualization();
@@ -553,7 +597,9 @@ static void emergency_reboot_disable_virtualization(void)
nmi_shootdown_cpus_on_restart();
}
}
-
+#else
+static void emergency_reboot_disable_virtualization(void) { }
+#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */
void __attribute__((weak)) mach_reboot_fixups(void)
{
@@ -787,21 +833,9 @@ void machine_crash_shutdown(struct pt_regs *regs)
}
#endif
-
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
-/*
- * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
- * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
- * GIF=0, i.e. if the crash occurred between CLGI and STGI.
- */
-void cpu_emergency_disable_virtualization(void)
-{
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-}
-
#if defined(CONFIG_SMP)
static nmi_shootdown_cb shootdown_callback;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b9145a63da77..b098b1fa2470 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -358,15 +358,11 @@ static void __init add_early_ima_buffer(u64 phys_addr)
#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE)
int __init ima_free_kexec_buffer(void)
{
- int rc;
-
if (!ima_kexec_buffer_size)
return -ENOENT;
- rc = memblock_phys_free(ima_kexec_buffer_phys,
- ima_kexec_buffer_size);
- if (rc)
- return rc;
+ memblock_free_late(ima_kexec_buffer_phys,
+ ima_kexec_buffer_size);
ima_kexec_buffer_phys = 0;
ima_kexec_buffer_size = 0;
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 2eabccde94fb..dcf325b7b022 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -256,7 +256,7 @@ static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg)
return 0;
}
-static int sev_cpuid_hv(struct cpuid_leaf *leaf)
+static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf)
{
int ret;
@@ -279,6 +279,45 @@ static int sev_cpuid_hv(struct cpuid_leaf *leaf)
return ret;
}
+static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ u32 cr4 = native_read_cr4();
+ int ret;
+
+ ghcb_set_rax(ghcb, leaf->fn);
+ ghcb_set_rcx(ghcb, leaf->subfn);
+
+ if (cr4 & X86_CR4_OSXSAVE)
+ /* Safe to read xcr0 */
+ ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
+ else
+ /* xgetbv will cause #UD - use reset value for xcr0 */
+ ghcb_set_xcr0(ghcb, 1);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) &&
+ ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ leaf->eax = ghcb->save.rax;
+ leaf->ebx = ghcb->save.rbx;
+ leaf->ecx = ghcb->save.rcx;
+ leaf->edx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+{
+ return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf)
+ : __sev_cpuid_hv_msr(leaf);
+}
+
/*
* This may be called early while still running on the initial identity
* mapping. Use RIP-relative addressing to obtain the correct address
@@ -388,19 +427,20 @@ snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
return false;
}
-static void snp_cpuid_hv(struct cpuid_leaf *leaf)
+static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
{
- if (sev_cpuid_hv(leaf))
+ if (sev_cpuid_hv(ghcb, ctxt, leaf))
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
}
-static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
+static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ struct cpuid_leaf *leaf)
{
struct cpuid_leaf leaf_hv = *leaf;
switch (leaf->fn) {
case 0x1:
- snp_cpuid_hv(&leaf_hv);
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
/* initial APIC ID */
leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
@@ -419,7 +459,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
break;
case 0xB:
leaf_hv.subfn = 0;
- snp_cpuid_hv(&leaf_hv);
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
/* extended APIC ID */
leaf->edx = leaf_hv.edx;
@@ -467,7 +507,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
}
break;
case 0x8000001E:
- snp_cpuid_hv(&leaf_hv);
+ snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
/* extended APIC ID */
leaf->eax = leaf_hv.eax;
@@ -488,7 +528,7 @@ static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
* Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
* should be treated as fatal by caller.
*/
-static int snp_cpuid(struct cpuid_leaf *leaf)
+static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -522,7 +562,7 @@ static int snp_cpuid(struct cpuid_leaf *leaf)
return 0;
}
- return snp_cpuid_postprocess(leaf);
+ return snp_cpuid_postprocess(ghcb, ctxt, leaf);
}
/*
@@ -544,14 +584,14 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
leaf.fn = fn;
leaf.subfn = subfn;
- ret = snp_cpuid(&leaf);
+ ret = snp_cpuid(NULL, NULL, &leaf);
if (!ret)
goto cpuid_done;
if (ret != -EOPNOTSUPP)
goto fail;
- if (sev_cpuid_hv(&leaf))
+ if (__sev_cpuid_hv_msr(&leaf))
goto fail;
cpuid_done:
@@ -848,14 +888,15 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
return ret;
}
-static int vc_handle_cpuid_snp(struct pt_regs *regs)
+static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
+ struct pt_regs *regs = ctxt->regs;
struct cpuid_leaf leaf;
int ret;
leaf.fn = regs->ax;
leaf.subfn = regs->cx;
- ret = snp_cpuid(&leaf);
+ ret = snp_cpuid(ghcb, ctxt, &leaf);
if (!ret) {
regs->ax = leaf.eax;
regs->bx = leaf.ebx;
@@ -874,7 +915,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
enum es_result ret;
int snp_cpuid_ret;
- snp_cpuid_ret = vc_handle_cpuid_snp(regs);
+ snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt);
if (!snp_cpuid_ret)
return ES_OK;
if (snp_cpuid_ret != -EOPNOTSUPP)
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 2787826d9f60..d8c1e3be74c0 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -868,8 +868,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
void snp_accept_memory(phys_addr_t start, phys_addr_t end)
{
- unsigned long vaddr;
- unsigned int npages;
+ unsigned long vaddr, npages;
if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return;
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index fd689921a1db..59e15dd8d0f8 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -205,10 +205,21 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long cl
return 0;
/*
- * For CLONE_VM, except vfork, the child needs a separate shadow
+ * For CLONE_VFORK the child will share the parents shadow stack.
+ * Make sure to clear the internal tracking of the thread shadow
+ * stack so the freeing logic run for child knows to leave it alone.
+ */
+ if (clone_flags & CLONE_VFORK) {
+ shstk->base = 0;
+ shstk->size = 0;
+ return 0;
+ }
+
+ /*
+ * For !CLONE_VM the child will use a copy of the parents shadow
* stack.
*/
- if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
+ if (!(clone_flags & CLONE_VM))
return 0;
size = adjust_shstk_size(stack_size);
@@ -408,7 +419,25 @@ void shstk_free(struct task_struct *tsk)
if (!tsk->mm || tsk->mm != current->mm)
return;
+ /*
+ * If shstk->base is NULL, then this task is not managing its
+ * own shadow stack (CLONE_VFORK). So skip freeing it.
+ */
+ if (!shstk->base)
+ return;
+
+ /*
+ * shstk->base is NULL for CLONE_VFORK child tasks, and so is
+ * normal. But size = 0 on a shstk->base is not normal and
+ * indicated an attempt to free the thread shadow stack twice.
+ * Warn about it.
+ */
+ if (WARN_ON(!shstk->size))
+ return;
+
unmap_shadow_stack(shstk->base, shstk->size);
+
+ shstk->size = 0;
}
static int wrss_control(bool enable)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d7667a29acf3..48e040618731 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -579,7 +579,6 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
}
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
{
return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -603,7 +602,14 @@ static int x86_cluster_flags(void)
return cpu_cluster_flags() | x86_sched_itmt_flags();
}
#endif
-#endif
+
+static int x86_die_flags(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return x86_sched_itmt_flags();
+
+ return 0;
+}
/*
* Set if a package/die has multiple NUMA nodes inside.
@@ -640,7 +646,7 @@ static void __init build_sched_topology(void)
*/
if (!x86_has_numa_in_package) {
x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, SD_INIT_NAME(DIE)
+ cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE)
};
}
@@ -1250,7 +1256,7 @@ bool smp_park_other_cpus_in_init(void)
if (this_cpu)
return false;
- for_each_present_cpu(cpu) {
+ for_each_cpu_and(cpu, &cpus_booted_once_mask, cpu_present_mask) {
if (cpu == this_cpu)
continue;
apicid = apic->cpu_present_to_apicid(cpu);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 83d41c2601d7..f15fb71f280e 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -156,7 +156,7 @@ SECTIONS
ALIGN_ENTRY_TEXT_END
*(.gnu.warning)
- } :text =0xcccc
+ } :text = 0xcccccccc
/* End of text section, which should occupy whole number of pages */
_etext = .;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 89ca7f4c1464..ed90f148140d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -101,7 +101,7 @@ config X86_SGX_KVM
config KVM_AMD
tristate "KVM for AMD processors support"
- depends on KVM
+ depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON)
help
Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions.
@@ -138,6 +138,19 @@ config KVM_XEN
If in doubt, say "N".
+config KVM_PROVE_MMU
+ bool "Prove KVM MMU correctness"
+ depends on DEBUG_KERNEL
+ depends on KVM
+ depends on EXPERT
+ help
+ Enables runtime assertions in KVM's MMU that are too costly to enable
+ in anything remotely resembling a production environment, e.g. this
+ gates code that verifies a to-be-freed page table doesn't have any
+ present SPTEs.
+
+ If in doubt, say "N".
+
config KVM_EXTERNAL_WRITE_TRACKING
bool
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index d3432687c9e6..0544e30b4946 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
+#include "linux/lockdep.h"
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
@@ -84,6 +85,18 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *e;
int i;
+ /*
+ * KVM has a semi-arbitrary rule that querying the guest's CPUID model
+ * with IRQs disabled is disallowed. The CPUID model can legitimately
+ * have over one hundred entries, i.e. the lookup is slow, and IRQs are
+ * typically disabled in KVM only when KVM is in a performance critical
+ * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break
+ * if this rule is violated, this assertion is purely to flag potential
+ * performance issues. If this fires, consider moving the lookup out
+ * of the hotpath, e.g. by caching information during CPUID updates.
+ */
+ lockdep_assert_irqs_enabled();
+
for (i = 0; i < nent; i++) {
e = &entries[i];
@@ -312,6 +325,27 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_cpuid_entry2 *best;
+ bool allow_gbpages;
+
+ BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES);
+ bitmap_zero(vcpu->arch.governed_features.enabled,
+ KVM_MAX_NR_GOVERNED_FEATURES);
+
+ /*
+ * If TDP is enabled, let the guest use GBPAGES if they're supported in
+ * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
+ * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+ * walk for performance and complexity reasons. Not to mention KVM
+ * _can't_ solve the problem because GVA->GPA walks aren't visible to
+ * KVM once a TDP translation is installed. Mimic hardware behavior so
+ * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+ * If TDP is disabled, honor *only* guest CPUID as KVM has full control
+ * and can install smaller shadow pages if the host lacks 1GiB support.
+ */
+ allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
+ if (allow_gbpages)
+ kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES);
best = kvm_find_cpuid_entry(vcpu, 1);
if (best && apic) {
@@ -647,7 +681,8 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
- F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI)
+ F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) |
+ F(AMX_COMPLEX)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
@@ -1154,6 +1189,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
cpuid_entry_override(entry, CPUID_8000_0001_EDX);
cpuid_entry_override(entry, CPUID_8000_0001_ECX);
break;
+ case 0x80000005:
+ /* Pass host L1 cache and TLB info. */
+ break;
case 0x80000006:
/* Drop reserved bits, pass host L2 cache and TLB info. */
entry->edx &= ~GENMASK(17, 16);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index b1658c0de847..284fa4704553 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -232,4 +232,50 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
return vcpu->arch.pv_cpuid.features & (1u << kvm_feature);
}
+enum kvm_governed_features {
+#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x,
+#include "governed_features.h"
+ KVM_NR_GOVERNED_FEATURES
+};
+
+static __always_inline int kvm_governed_feature_index(unsigned int x86_feature)
+{
+ switch (x86_feature) {
+#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x;
+#include "governed_features.h"
+ default:
+ return -1;
+ }
+}
+
+static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature)
+{
+ return kvm_governed_feature_index(x86_feature) >= 0;
+}
+
+static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature));
+
+ __set_bit(kvm_governed_feature_index(x86_feature),
+ vcpu->arch.governed_features.enabled);
+}
+
+static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature))
+ kvm_governed_feature_set(vcpu, x86_feature);
+}
+
+static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
+ unsigned int x86_feature)
+{
+ BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature));
+
+ return test_bit(kvm_governed_feature_index(x86_feature),
+ vcpu->arch.governed_features.enabled);
+}
+
#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 936a397a08cd..2673cd5c46cb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1799,13 +1799,11 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
op->addr.mem,
&op->val,
op->bytes);
- break;
case OP_MEM_STR:
return segmented_write(ctxt,
op->addr.mem,
op->data,
op->bytes * op->count);
- break;
case OP_XMM:
kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
break;
diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h
new file mode 100644
index 000000000000..423a73395c10
--- /dev/null
+++ b/arch/x86/kvm/governed_features.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE)
+BUILD_BUG()
+#endif
+
+#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x)
+
+KVM_GOVERNED_X86_FEATURE(GBPAGES)
+KVM_GOVERNED_X86_FEATURE(XSAVES)
+KVM_GOVERNED_X86_FEATURE(VMX)
+KVM_GOVERNED_X86_FEATURE(NRIPS)
+KVM_GOVERNED_X86_FEATURE(TSCRATEMSR)
+KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD)
+KVM_GOVERNED_X86_FEATURE(LBRV)
+KVM_GOVERNED_X86_FEATURE(PAUSEFILTER)
+KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD)
+KVM_GOVERNED_X86_FEATURE(VGIF)
+KVM_GOVERNED_X86_FEATURE(VNMI)
+
+#undef KVM_GOVERNED_X86_FEATURE
+#undef KVM_GOVERNED_FEATURE
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index b28fd020066f..7c2dac6824e2 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1293,7 +1293,6 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
case HV_X64_MSR_VP_ASSIST_PAGE:
return hv_vcpu->cpuid_cache.features_eax &
HV_MSR_APIC_ACCESS_AVAILABLE;
- break;
case HV_X64_MSR_TSC_FREQUENCY:
case HV_X64_MSR_APIC_FREQUENCY:
return hv_vcpu->cpuid_cache.features_eax &
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index ab65f3a47dfd..be7aeb9b8ea3 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -213,7 +213,6 @@ struct x86_emulate_ops {
bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool exact_only);
- bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a983a16163b1..dcd60b39e794 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -376,7 +376,8 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
struct kvm_vcpu *vcpu;
unsigned long i;
u32 max_id = 255; /* enough space for any xAPIC ID */
- bool xapic_id_mismatch = false;
+ bool xapic_id_mismatch;
+ int r;
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
@@ -386,9 +387,14 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
"Dirty APIC map without an in-kernel local APIC");
mutex_lock(&kvm->arch.apic_map_lock);
+
+retry:
/*
- * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map
- * (if clean) or the APIC registers (if dirty).
+ * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean)
+ * or the APIC registers (if dirty). Note, on retry the map may have
+ * not yet been marked dirty by whatever task changed a vCPU's x2APIC
+ * ID, i.e. the map may still show up as in-progress. In that case
+ * this task still needs to retry and complete its calculation.
*/
if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
DIRTY, UPDATE_IN_PROGRESS) == CLEAN) {
@@ -397,6 +403,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
return;
}
+ /*
+ * Reset the mismatch flag between attempts so that KVM does the right
+ * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e.
+ * keep max_id strictly increasing. Disallowing max_id from shrinking
+ * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU
+ * with the highest x2APIC ID is toggling its APIC on and off.
+ */
+ xapic_id_mismatch = false;
+
kvm_for_each_vcpu(i, vcpu, kvm)
if (kvm_apic_present(vcpu))
max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
@@ -415,9 +430,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
if (!kvm_apic_present(vcpu))
continue;
- if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) {
+ r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch);
+ if (r) {
kvfree(new);
new = NULL;
+ if (r == -E2BIG) {
+ cond_resched();
+ goto retry;
+ }
+
goto out;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 92d5a1924fc1..253fb2093d5d 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -121,6 +121,8 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes);
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index ec169f5c7dce..f7901cb4d2fa 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -25,6 +25,7 @@
#include "kvm_cache_regs.h"
#include "smm.h"
#include "kvm_emulate.h"
+#include "page_track.h"
#include "cpuid.h"
#include "spte.h"
@@ -53,7 +54,7 @@
#include <asm/io.h>
#include <asm/set_memory.h>
#include <asm/vmx.h>
-#include <asm/kvm_page_track.h>
+
#include "trace.h"
extern bool itlb_multihit_kvm_mitigation;
@@ -115,11 +116,6 @@ static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
-#ifdef MMU_DEBUG
-bool dbg = 0;
-module_param(dbg, bool, 0644);
-#endif
-
#define PTE_PREFETCH_NUM 8
#include <trace/events/kvm.h>
@@ -278,16 +274,12 @@ static inline bool kvm_available_flush_remote_tlbs_range(void)
return kvm_x86_ops.flush_remote_tlbs_range;
}
-void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
- gfn_t nr_pages)
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
- int ret = -EOPNOTSUPP;
+ if (!kvm_x86_ops.flush_remote_tlbs_range)
+ return -EOPNOTSUPP;
- if (kvm_x86_ops.flush_remote_tlbs_range)
- ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn,
- nr_pages);
- if (ret)
- kvm_flush_remote_tlbs(kvm);
+ return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
}
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
@@ -490,7 +482,7 @@ retry:
*/
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
- WARN_ON(is_shadow_present_pte(*sptep));
+ WARN_ON_ONCE(is_shadow_present_pte(*sptep));
__set_spte(sptep, new_spte);
}
@@ -502,7 +494,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;
- WARN_ON(!is_shadow_present_pte(new_spte));
+ WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
check_spte_writable_invariants(new_spte);
if (!is_shadow_present_pte(old_spte)) {
@@ -515,7 +507,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
- WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+ WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
return old_spte;
}
@@ -597,7 +589,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
* by a refcounted page, the refcount is elevated.
*/
page = kvm_pfn_to_refcounted_page(pfn);
- WARN_ON(page && !page_count(page));
+ WARN_ON_ONCE(page && !page_count(page));
if (is_accessed_spte(old_spte))
kvm_set_pfn_accessed(pfn);
@@ -812,7 +804,7 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->disallow_lpage += count;
- WARN_ON(linfo->disallow_lpage < 0);
+ WARN_ON_ONCE(linfo->disallow_lpage < 0);
}
}
@@ -839,8 +831,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
/* the non-leaf shadow pages are keeping readonly. */
if (sp->role.level > PG_LEVEL_4K)
- return kvm_slot_page_track_add_page(kvm, slot, gfn,
- KVM_PAGE_TRACK_WRITE);
+ return __kvm_write_track_add_gfn(kvm, slot, gfn);
kvm_mmu_gfn_disallow_lpage(slot, gfn);
@@ -886,8 +877,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
if (sp->role.level > PG_LEVEL_4K)
- return kvm_slot_page_track_remove_page(kvm, slot, gfn,
- KVM_PAGE_TRACK_WRITE);
+ return __kvm_write_track_remove_gfn(kvm, slot, gfn);
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
@@ -941,10 +931,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
int count = 0;
if (!rmap_head->val) {
- rmap_printk("%p %llx 0->1\n", spte, *spte);
rmap_head->val = (unsigned long)spte;
} else if (!(rmap_head->val & 1)) {
- rmap_printk("%p %llx 1->many\n", spte, *spte);
desc = kvm_mmu_memory_cache_alloc(cache);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
@@ -953,7 +941,6 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
- rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
count = desc->tail_count + desc->spte_count;
@@ -973,7 +960,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
return count;
}
-static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+static void pte_list_desc_remove_entry(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
struct pte_list_desc *desc, int i)
{
struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
@@ -984,7 +972,7 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
* when adding an entry and the previous head is full, and heads are
* removed (this flow) when they become empty.
*/
- BUG_ON(j < 0);
+ KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
/*
* Replace the to-be-freed SPTE with the last valid entry from the head
@@ -1009,35 +997,34 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
mmu_free_pte_list_desc(head_desc);
}
-static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
+static void pte_list_remove(struct kvm *kvm, u64 *spte,
+ struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
int i;
- if (!rmap_head->val) {
- pr_err("%s: %p 0->BUG\n", __func__, spte);
- BUG();
- } else if (!(rmap_head->val & 1)) {
- rmap_printk("%p 1->0\n", spte);
- if ((u64 *)rmap_head->val != spte) {
- pr_err("%s: %p 1->BUG\n", __func__, spte);
- BUG();
- }
+ if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
+ return;
+
+ if (!(rmap_head->val & 1)) {
+ if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
+ return;
+
rmap_head->val = 0;
} else {
- rmap_printk("%p many->many\n", spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(rmap_head, desc, i);
+ pte_list_desc_remove_entry(kvm, rmap_head,
+ desc, i);
return;
}
}
desc = desc->more;
}
- pr_err("%s: %p many->many\n", __func__, spte);
- BUG();
+
+ KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
}
}
@@ -1045,7 +1032,7 @@ static void kvm_zap_one_rmap_spte(struct kvm *kvm,
struct kvm_rmap_head *rmap_head, u64 *sptep)
{
mmu_spte_clear_track_bits(kvm, sptep);
- pte_list_remove(sptep, rmap_head);
+ pte_list_remove(kvm, sptep, rmap_head);
}
/* Return true if at least one SPTE was zapped, false otherwise */
@@ -1120,7 +1107,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
slot = __gfn_to_memslot(slots, gfn);
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
- pte_list_remove(spte, rmap_head);
+ pte_list_remove(kvm, spte, rmap_head);
}
/*
@@ -1212,7 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
struct kvm_mmu_page *sp;
sp = sptep_to_sp(sptep);
- WARN_ON(sp->role.level == PG_LEVEL_4K);
+ WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
drop_spte(kvm, sptep);
@@ -1241,8 +1228,6 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
!(pt_protect && is_mmu_writable_spte(spte)))
return false;
- rmap_printk("spte %p %llx\n", sptep, *sptep);
-
if (pt_protect)
spte &= ~shadow_mmu_writable_mask;
spte = spte & ~PT_WRITABLE_MASK;
@@ -1267,9 +1252,7 @@ static bool spte_clear_dirty(u64 *sptep)
{
u64 spte = *sptep;
- rmap_printk("spte %p %llx\n", sptep, *sptep);
-
- MMU_WARN_ON(!spte_ad_enabled(spte));
+ KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
spte &= ~shadow_dirty_mask;
return mmu_spte_update(sptep, spte);
}
@@ -1475,14 +1458,11 @@ static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
u64 new_spte;
kvm_pfn_t new_pfn;
- WARN_ON(pte_huge(pte));
+ WARN_ON_ONCE(pte_huge(pte));
new_pfn = pte_pfn(pte);
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
- rmap_printk("spte %p %llx gfn %llx (%d)\n",
- sptep, *sptep, gfn, level);
-
need_flush = true;
if (pte_write(pte)) {
@@ -1588,7 +1568,7 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
range->start, range->end - 1, &iterator)
ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
- iterator.level, range->pte);
+ iterator.level, range->arg.pte);
return ret;
}
@@ -1710,21 +1690,19 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
return young;
}
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
+static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
{
- u64 *pos;
- u64 *end;
+#ifdef CONFIG_KVM_PROVE_MMU
+ int i;
- for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++)
- if (is_shadow_present_pte(*pos)) {
- printk(KERN_ERR "%s: %p %llx\n", __func__,
- pos, *pos);
- return 0;
- }
- return 1;
-}
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
+ pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
+ sp->spt[i], &sp->spt[i],
+ kvm_mmu_page_get_gfn(sp, i));
+ }
#endif
+}
/*
* This value is the sum of all of the kvm instances's
@@ -1752,7 +1730,8 @@ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
{
- MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
+ kvm_mmu_check_sptes_at_free(sp);
+
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
@@ -1775,16 +1754,16 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
pte_list_add(cache, parent_pte, &sp->parent_ptes);
}
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
u64 *parent_pte)
{
- pte_list_remove(parent_pte, &sp->parent_ptes);
+ pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
}
-static void drop_parent_pte(struct kvm_mmu_page *sp,
+static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
u64 *parent_pte)
{
- mmu_page_remove_parent_pte(sp, parent_pte);
+ mmu_page_remove_parent_pte(kvm, sp, parent_pte);
mmu_spte_clear_no_track(parent_pte);
}
@@ -1840,7 +1819,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
{
--sp->unsync_children;
- WARN_ON((int)sp->unsync_children < 0);
+ WARN_ON_ONCE((int)sp->unsync_children < 0);
__clear_bit(idx, sp->unsync_child_bitmap);
}
@@ -1898,7 +1877,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- WARN_ON(!sp->unsync);
+ WARN_ON_ONCE(!sp->unsync);
trace_kvm_mmu_sync_page(sp);
sp->unsync = 0;
--kvm->stat.mmu_unsync;
@@ -2073,11 +2052,11 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
if (pvec->nr == 0)
return 0;
- WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+ WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);
sp = pvec->page[0].sp;
level = sp->role.level;
- WARN_ON(level == PG_LEVEL_4K);
+ WARN_ON_ONCE(level == PG_LEVEL_4K);
parents->parent[level-2] = sp;
@@ -2099,7 +2078,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
if (!sp)
return;
- WARN_ON(idx == INVALID_INDEX);
+ WARN_ON_ONCE(idx == INVALID_INDEX);
clear_unsync_child_bit(sp, idx);
level++;
} while (!sp->unsync_children);
@@ -2220,7 +2199,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
if (ret < 0)
break;
- WARN_ON(!list_empty(&invalid_list));
+ WARN_ON_ONCE(!list_empty(&invalid_list));
if (ret > 0)
kvm_flush_remote_tlbs(kvm);
}
@@ -2499,7 +2478,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (child->role.access == direct_access)
return;
- drop_parent_pte(child, sptep);
+ drop_parent_pte(vcpu->kvm, child, sptep);
kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
}
}
@@ -2517,7 +2496,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
drop_spte(kvm, spte);
} else {
child = spte_to_child_sp(pte);
- drop_parent_pte(child, spte);
+ drop_parent_pte(kvm, child, spte);
/*
* Recursively zap nested TDP SPs, parentless SPs are
@@ -2548,13 +2527,13 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm,
return zapped;
}
-static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *sptep;
struct rmap_iterator iter;
while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
- drop_parent_pte(sp, sptep);
+ drop_parent_pte(kvm, sp, sptep);
}
static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -2593,7 +2572,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
- kvm_mmu_unlink_parents(sp);
+ kvm_mmu_unlink_parents(kvm, sp);
/* Zapping children means active_mmu_pages has become unstable. */
list_unstable = *nr_zapped;
@@ -2675,7 +2654,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
list_for_each_entry_safe(sp, nsp, invalid_list, link) {
- WARN_ON(!sp->role.invalid || sp->root_count);
+ WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
kvm_mmu_free_shadow_page(sp);
}
}
@@ -2775,12 +2754,9 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
LIST_HEAD(invalid_list);
int r;
- pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
write_lock(&kvm->mmu_lock);
for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
- pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
- sp->role.word);
r = 1;
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
@@ -2831,7 +2807,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
* track machinery is used to write-protect upper-level shadow pages,
* i.e. this guards the role.level == 4K assertion below!
*/
- if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
return -EPERM;
/*
@@ -2873,7 +2849,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
continue;
}
- WARN_ON(sp->role.level != PG_LEVEL_4K);
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
kvm_unsync_page(kvm, sp);
}
if (locked)
@@ -2938,9 +2914,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
bool prefetch = !fault || fault->prefetch;
bool write_fault = fault && fault->write;
- pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
- *sptep, write_fault, gfn);
-
if (unlikely(is_noslot_pfn(pfn))) {
vcpu->stat.pf_mmio_spte_created++;
mark_mmio_spte(vcpu, sptep, gfn, pte_access);
@@ -2957,11 +2930,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
u64 pte = *sptep;
child = spte_to_child_sp(pte);
- drop_parent_pte(child, sptep);
+ drop_parent_pte(vcpu->kvm, child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
- pgprintk("hfn old %llx new %llx\n",
- spte_to_pfn(*sptep), pfn);
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
@@ -2986,8 +2957,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
if (flush)
kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
- pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-
if (!was_rmapped) {
WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
rmap_add(vcpu, slot, sptep, gfn, pte_access);
@@ -3033,7 +3002,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
u64 *spte, *start = NULL;
int i;
- WARN_ON(!sp->role.direct);
+ WARN_ON_ONCE(!sp->role.direct);
i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
spte = sp->spt + i;
@@ -3574,12 +3543,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (!VALID_PAGE(*root_hpa))
return;
- /*
- * The "root" may be a special root, e.g. a PAE entry, treat it as a
- * SPTE to ensure any non-PA bits are dropped.
- */
- sp = spte_to_child_sp(*root_hpa);
- if (WARN_ON(!sp))
+ sp = root_to_sp(*root_hpa);
+ if (WARN_ON_ONCE(!sp))
return;
if (is_tdp_mmu_page(sp))
@@ -3624,7 +3589,9 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
&invalid_list);
if (free_active_root) {
- if (to_shadow_page(mmu->root.hpa)) {
+ if (kvm_mmu_is_dummy_root(mmu->root.hpa)) {
+ /* Nothing to cleanup for dummy roots. */
+ } else if (root_to_sp(mmu->root.hpa)) {
mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
} else if (mmu->pae_root) {
for (i = 0; i < 4; ++i) {
@@ -3648,6 +3615,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
{
unsigned long roots_to_free = 0;
+ struct kvm_mmu_page *sp;
hpa_t root_hpa;
int i;
@@ -3662,8 +3630,8 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
if (!VALID_PAGE(root_hpa))
continue;
- if (!to_shadow_page(root_hpa) ||
- to_shadow_page(root_hpa)->role.guest_mode)
+ sp = root_to_sp(root_hpa);
+ if (!sp || sp->role.guest_mode)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
}
@@ -3671,19 +3639,6 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
-
-static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
-{
- int ret = 0;
-
- if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- ret = 1;
- }
-
- return ret;
-}
-
static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
u8 level)
{
@@ -3821,8 +3776,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
root_gfn = root_pgd >> PAGE_SHIFT;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
+ if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
+ mmu->root.hpa = kvm_mmu_get_dummy_root();
+ return 0;
+ }
/*
* On SVM, reading PDPTRs might access guest memory, which might fault
@@ -3834,8 +3791,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (!(pdptrs[i] & PT_PRESENT_MASK))
continue;
- if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
- return 1;
+ if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
+ pdptrs[i] = 0;
}
}
@@ -4002,7 +3959,7 @@ static bool is_unsync_root(hpa_t root)
{
struct kvm_mmu_page *sp;
- if (!VALID_PAGE(root))
+ if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root))
return false;
/*
@@ -4018,7 +3975,7 @@ static bool is_unsync_root(hpa_t root)
* requirement isn't satisfied.
*/
smp_rmb();
- sp = to_shadow_page(root);
+ sp = root_to_sp(root);
/*
* PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
@@ -4048,11 +4005,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu->root.hpa;
- sp = to_shadow_page(root);
if (!is_unsync_root(root))
return;
+ sp = root_to_sp(root);
+
write_lock(&vcpu->kvm->mmu_lock);
mmu_sync_children(vcpu, sp, true);
write_unlock(&vcpu->kvm->mmu_lock);
@@ -4194,7 +4152,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
return RET_PF_EMULATE;
reserved = get_mmio_spte(vcpu, addr, &spte);
- if (WARN_ON(reserved))
+ if (WARN_ON_ONCE(reserved))
return -EINVAL;
if (is_mmio_spte(spte)) {
@@ -4232,7 +4190,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
* guest is writing the page which is write tracked which can
* not be fixed by page fault handler.
*/
- if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
return true;
return false;
@@ -4382,7 +4340,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
- struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
+ struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
/* Special roots, e.g. pae_root, are not backed by shadow pages. */
if (sp && is_obsolete_sp(vcpu->kvm, sp))
@@ -4407,6 +4365,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
{
int r;
+ /* Dummy roots are used only for shadowing bad guest roots. */
+ if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
if (page_fault_handle_page_track(vcpu, fault))
return RET_PF_EMULATE;
@@ -4443,8 +4405,6 @@ out_unlock:
static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
- pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
-
/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
fault->max_level = PG_LEVEL_2M;
return direct_page_fault(vcpu, fault);
@@ -4562,9 +4522,19 @@ static void nonpaging_init_context(struct kvm_mmu *context)
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
union kvm_mmu_page_role role)
{
- return (role.direct || pgd == root->pgd) &&
- VALID_PAGE(root->hpa) &&
- role.word == to_shadow_page(root->hpa)->role.word;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root->hpa))
+ return false;
+
+ if (!role.direct && pgd != root->pgd)
+ return false;
+
+ sp = root_to_sp(root->hpa);
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
+ return role.word == sp->role.word;
}
/*
@@ -4634,11 +4604,10 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
gpa_t new_pgd, union kvm_mmu_page_role new_role)
{
/*
- * For now, limit the caching to 64-bit hosts+VMs in order to avoid
- * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
- * later if necessary.
+ * Limit reuse to 64-bit hosts+VMs without "special" roots in order to
+ * avoid having to deal with PDPTEs and other complexities.
*/
- if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
+ if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
if (VALID_PAGE(mmu->root.hpa))
@@ -4684,9 +4653,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
* If this is a direct root page, it doesn't have a write flooding
* count. Otherwise, clear the write flooding count.
*/
- if (!new_role.direct)
- __clear_sp_write_flooding_count(
- to_shadow_page(vcpu->arch.mmu->root.hpa));
+ if (!new_role.direct) {
+ struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (!WARN_ON_ONCE(!sp))
+ __clear_sp_write_flooding_count(sp);
+ }
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
@@ -4808,28 +4780,13 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
}
}
-static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
-{
- /*
- * If TDP is enabled, let the guest use GBPAGES if they're supported in
- * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
- * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
- * walk for performance and complexity reasons. Not to mention KVM
- * _can't_ solve the problem because GVA->GPA walks aren't visible to
- * KVM once a TDP translation is installed. Mimic hardware behavior so
- * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
- */
- return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
-}
-
static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
__reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
context->cpu_role.base.level, is_efer_nx(context),
- guest_can_use_gbpages(vcpu),
+ guest_can_use(vcpu, X86_FEATURE_GBPAGES),
is_cr4_pse(context),
guest_cpuid_is_amd_or_hygon(vcpu));
}
@@ -4906,7 +4863,8 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->root_role.level,
context->root_role.efer_nx,
- guest_can_use_gbpages(vcpu), is_pse, is_amd);
+ guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+ is_pse, is_amd);
if (!shadow_me_mask)
return;
@@ -5467,8 +5425,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
* physical address properties) in a single VM would require tracking
* all relevant CPUID information in kvm_mmu_page_role. That is very
* undesirable as it would increase the memory requirements for
- * gfn_track (see struct kvm_mmu_page_role comments). For now that
- * problem is swept under the rug; KVM's CPUID API is horrific and
+ * gfn_write_track (see struct kvm_mmu_page_role comments). For now
+ * that problem is swept under the rug; KVM's CPUID API is horrific and
* it's all but impossible to solve it without introducing a new API.
*/
vcpu->arch.root_mmu.root_role.word = 0;
@@ -5531,9 +5489,9 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
+ WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
+ WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
}
@@ -5546,16 +5504,21 @@ static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
/*
* When freeing obsolete roots, treat roots as obsolete if they don't
- * have an associated shadow page. This does mean KVM will get false
+ * have an associated shadow page, as it's impossible to determine if
+ * such roots are fresh or stale. This does mean KVM will get false
* positives and free roots that don't strictly need to be freed, but
* such false positives are relatively rare:
*
- * (a) only PAE paging and nested NPT has roots without shadow pages
+ * (a) only PAE paging and nested NPT have roots without shadow pages
+ * (or any shadow paging flavor with a dummy root, see note below)
* (b) remote reloads due to a memslot update obsoletes _all_ roots
* (c) KVM doesn't track previous roots for PAE paging, and the guest
* is unlikely to zap an in-use PGD.
+ *
+ * Note! Dummy roots are unique in that they are obsoleted by memslot
+ * _creation_! See also FNAME(fetch).
*/
- sp = to_shadow_page(root_hpa);
+ sp = root_to_sp(root_hpa);
return !sp || is_obsolete_sp(kvm, sp);
}
@@ -5634,9 +5597,6 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
{
unsigned offset, pte_size, misaligned;
- pgprintk("misaligned: gpa %llx bytes %d role %x\n",
- gpa, bytes, sp->role.word);
-
offset = offset_in_page(gpa);
pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
@@ -5684,9 +5644,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
return spte;
}
-static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes,
- struct kvm_page_track_notifier_node *node)
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *sp;
@@ -5702,8 +5661,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return;
- pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
-
write_lock(&vcpu->kvm->mmu_lock);
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
@@ -5742,7 +5699,18 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
int r, emulation_type = EMULTYPE_PF;
bool direct = vcpu->arch.mmu->root_role.direct;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ /*
+ * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
+ * checks when emulating instructions that triggers implicit access.
+ * WARN if hardware generates a fault with an error code that collides
+ * with the KVM-defined value. Clear the flag and continue on, i.e.
+ * don't terminate the VM, as KVM can't possibly be relying on a flag
+ * that KVM doesn't know about.
+ */
+ if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
+ error_code &= ~PFERR_IMPLICIT_ACCESS;
+
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
r = RET_PF_INVALID;
@@ -6099,7 +6067,7 @@ restart:
* pages. Skip the bogus page, otherwise we'll get stuck in an
* infinite loop if the page gets put back on the list (again).
*/
- if (WARN_ON(sp->role.invalid))
+ if (WARN_ON_ONCE(sp->role.invalid))
continue;
/*
@@ -6199,32 +6167,15 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}
-static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_page_track_notifier_node *node)
+void kvm_mmu_init_vm(struct kvm *kvm)
{
- kvm_mmu_zap_all_fast(kvm);
-}
-
-int kvm_mmu_init_vm(struct kvm *kvm)
-{
- struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
- int r;
-
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- if (tdp_mmu_enabled) {
- r = kvm_mmu_init_tdp_mmu(kvm);
- if (r < 0)
- return r;
- }
-
- node->track_write = kvm_mmu_pte_write;
- node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
- kvm_page_track_register_notifier(kvm, node);
+ if (tdp_mmu_enabled)
+ kvm_mmu_init_tdp_mmu(kvm);
kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@@ -6233,8 +6184,6 @@ int kvm_mmu_init_vm(struct kvm *kvm)
kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
-
- return 0;
}
static void mmu_free_vm_memory_caches(struct kvm *kvm)
@@ -6246,10 +6195,6 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
- struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
-
- kvm_page_track_unregister_notifier(kvm, node);
-
if (tdp_mmu_enabled)
kvm_mmu_uninit_tdp_mmu(kvm);
@@ -6294,7 +6239,6 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
bool flush;
- int i;
if (WARN_ON_ONCE(gfn_end <= gfn_start))
return;
@@ -6305,11 +6249,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
- if (tdp_mmu_enabled) {
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
- flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
- gfn_end, true, flush);
- }
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
if (flush)
kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
@@ -6670,7 +6611,7 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
*/
if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
- kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
}
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
@@ -6689,20 +6630,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
}
}
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
- const struct kvm_memory_slot *memslot)
-{
- /*
- * All current use cases for flushing the TLBs for a specific memslot
- * related to dirty logging, and many do the TLB flush out of mmu_lock.
- * The interaction between the various operations on memslot must be
- * serialized by slots_locks to ensure the TLB flush from one operation
- * is observed by any other operation on the same memslot.
- */
- lockdep_assert_held(&kvm->slots_lock);
- kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
-}
-
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
{
@@ -6732,7 +6659,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
*/
}
-void kvm_mmu_zap_all(struct kvm *kvm)
+static void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
@@ -6741,7 +6668,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
write_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
- if (WARN_ON(sp->role.invalid))
+ if (WARN_ON_ONCE(sp->role.invalid))
continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
@@ -6757,9 +6684,20 @@ restart:
write_unlock(&kvm->mmu_lock);
}
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ kvm_mmu_zap_all(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_zap_all_fast(kvm);
+}
+
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
- WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+ WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
gen &= MMIO_SPTE_GEN_MASK;
@@ -6862,7 +6800,7 @@ static void mmu_destroy_caches(void)
static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
{
if (nx_hugepage_mitigation_hard_disabled)
- return sprintf(buffer, "never\n");
+ return sysfs_emit(buffer, "never\n");
return param_get_bool(buffer, kp);
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index d39af5639ce9..decc1f153669 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -6,18 +6,10 @@
#include <linux/kvm_host.h>
#include <asm/kvm_host.h>
-#undef MMU_DEBUG
-
-#ifdef MMU_DEBUG
-extern bool dbg;
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0)
-#define MMU_WARN_ON(x) WARN_ON(x)
+#ifdef CONFIG_KVM_PROVE_MMU
+#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
#else
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-#define MMU_WARN_ON(x) do { } while (0)
+#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x)
#endif
/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
@@ -44,6 +36,16 @@ extern bool dbg;
#define INVALID_PAE_ROOT 0
#define IS_VALID_PAE_ROOT(x) (!!(x))
+static inline hpa_t kvm_mmu_get_dummy_root(void)
+{
+ return my_zero_pfn(0) << PAGE_SHIFT;
+}
+
+static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
+{
+ return is_zero_pfn(shadow_page >> PAGE_SHIFT);
+}
+
typedef u64 __rcu *tdp_ptep_t;
struct kvm_mmu_page {
@@ -56,7 +58,12 @@ struct kvm_mmu_page {
bool tdp_mmu_page;
bool unsync;
- u8 mmu_valid_gen;
+ union {
+ u8 mmu_valid_gen;
+
+ /* Only accessed under slots_lock. */
+ bool tdp_mmu_scheduled_root_to_zap;
+ };
/*
* The shadow page can't be replaced by an equivalent huge page
@@ -98,13 +105,7 @@ struct kvm_mmu_page {
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
tdp_ptep_t ptep;
};
- union {
- DECLARE_BITMAP(unsync_child_bitmap, 512);
- struct {
- struct work_struct tdp_mmu_async_work;
- void *tdp_mmu_async_data;
- };
- };
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
/*
* Tracks shadow pages that, if zapped, would allow KVM to create an NX
@@ -170,9 +171,6 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn,
int min_level);
-void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
- gfn_t nr_pages);
-
/* Flush the given page (huge or not) of guest memory. */
static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
{
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 0a2ac438d647..c87da11f3a04 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -12,13 +12,13 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/lockdep.h>
#include <linux/kvm_host.h>
#include <linux/rculist.h>
-#include <asm/kvm_page_track.h>
-
#include "mmu.h"
#include "mmu_internal.h"
+#include "page_track.h"
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
{
@@ -28,103 +28,64 @@ bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
{
- int i;
-
- for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
- kvfree(slot->arch.gfn_track[i]);
- slot->arch.gfn_track[i] = NULL;
- }
+ kvfree(slot->arch.gfn_write_track);
+ slot->arch.gfn_write_track = NULL;
}
-int kvm_page_track_create_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- unsigned long npages)
+static int __kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot,
+ unsigned long npages)
{
- int i;
-
- for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
- if (i == KVM_PAGE_TRACK_WRITE &&
- !kvm_page_track_write_tracking_enabled(kvm))
- continue;
-
- slot->arch.gfn_track[i] =
- __vcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
- GFP_KERNEL_ACCOUNT);
- if (!slot->arch.gfn_track[i])
- goto track_free;
- }
+ const size_t size = sizeof(*slot->arch.gfn_write_track);
- return 0;
+ if (!slot->arch.gfn_write_track)
+ slot->arch.gfn_write_track = __vcalloc(npages, size,
+ GFP_KERNEL_ACCOUNT);
-track_free:
- kvm_page_track_free_memslot(slot);
- return -ENOMEM;
+ return slot->arch.gfn_write_track ? 0 : -ENOMEM;
}
-static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages)
{
- if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX)
- return false;
+ if (!kvm_page_track_write_tracking_enabled(kvm))
+ return 0;
- return true;
+ return __kvm_page_track_write_tracking_alloc(slot, npages);
}
int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
{
- unsigned short *gfn_track;
-
- if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE])
- return 0;
-
- gfn_track = __vcalloc(slot->npages, sizeof(*gfn_track),
- GFP_KERNEL_ACCOUNT);
- if (gfn_track == NULL)
- return -ENOMEM;
-
- slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track;
- return 0;
+ return __kvm_page_track_write_tracking_alloc(slot, slot->npages);
}
-static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode, short count)
+static void update_gfn_write_track(struct kvm_memory_slot *slot, gfn_t gfn,
+ short count)
{
int index, val;
index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
- val = slot->arch.gfn_track[mode][index];
+ val = slot->arch.gfn_write_track[index];
- if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
+ if (WARN_ON_ONCE(val + count < 0 || val + count > USHRT_MAX))
return;
- slot->arch.gfn_track[mode][index] += count;
+ slot->arch.gfn_write_track[index] += count;
}
-/*
- * add guest page to the tracking pool so that corresponding access on that
- * page will be intercepted.
- *
- * It should be called under the protection both of mmu-lock and kvm->srcu
- * or kvm->slots_lock.
- *
- * @kvm: the guest instance we are interested in.
- * @slot: the @gfn belongs to.
- * @gfn: the guest page.
- * @mode: tracking mode, currently only write track is supported.
- */
-void kvm_slot_page_track_add_page(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode)
+void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn)
{
+ lockdep_assert_held_write(&kvm->mmu_lock);
- if (WARN_ON(!page_track_mode_is_valid(mode)))
- return;
+ lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
+ srcu_read_lock_held(&kvm->srcu));
- if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
- !kvm_page_track_write_tracking_enabled(kvm)))
+ if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
return;
- update_gfn_track(slot, gfn, mode, 1);
+ update_gfn_write_track(slot, gfn, 1);
/*
* new track stops large page mapping for the
@@ -132,37 +93,22 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
*/
kvm_mmu_gfn_disallow_lpage(slot, gfn);
- if (mode == KVM_PAGE_TRACK_WRITE)
- if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
- kvm_flush_remote_tlbs(kvm);
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
+ kvm_flush_remote_tlbs(kvm);
}
-EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page);
-/*
- * remove the guest page from the tracking pool which stops the interception
- * of corresponding access on that page. It is the opposed operation of
- * kvm_slot_page_track_add_page().
- *
- * It should be called under the protection both of mmu-lock and kvm->srcu
- * or kvm->slots_lock.
- *
- * @kvm: the guest instance we are interested in.
- * @slot: the @gfn belongs to.
- * @gfn: the guest page.
- * @mode: tracking mode, currently only write track is supported.
- */
-void kvm_slot_page_track_remove_page(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode)
+void __kvm_write_track_remove_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn)
{
- if (WARN_ON(!page_track_mode_is_valid(mode)))
- return;
+ lockdep_assert_held_write(&kvm->mmu_lock);
- if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
- !kvm_page_track_write_tracking_enabled(kvm)))
+ lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
+ srcu_read_lock_held(&kvm->srcu));
+
+ if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
return;
- update_gfn_track(slot, gfn, mode, -1);
+ update_gfn_write_track(slot, gfn, -1);
/*
* allow large page mapping for the tracked page
@@ -170,31 +116,26 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
*/
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page);
/*
* check if the corresponding access on the specified guest page is tracked.
*/
-bool kvm_slot_page_track_is_active(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- gfn_t gfn, enum kvm_page_track_mode mode)
+bool kvm_gfn_is_write_tracked(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn)
{
int index;
- if (WARN_ON(!page_track_mode_is_valid(mode)))
- return false;
-
if (!slot)
return false;
- if (mode == KVM_PAGE_TRACK_WRITE &&
- !kvm_page_track_write_tracking_enabled(kvm))
+ if (!kvm_page_track_write_tracking_enabled(kvm))
return false;
index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
- return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
+ return !!READ_ONCE(slot->arch.gfn_write_track[index]);
}
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
void kvm_page_track_cleanup(struct kvm *kvm)
{
struct kvm_page_track_notifier_head *head;
@@ -216,17 +157,22 @@ int kvm_page_track_init(struct kvm *kvm)
* register the notifier so that event interception for the tracked guest
* pages can be received.
*/
-void
-kvm_page_track_register_notifier(struct kvm *kvm,
- struct kvm_page_track_notifier_node *n)
+int kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
{
struct kvm_page_track_notifier_head *head;
+ if (!kvm || kvm->mm != current->mm)
+ return -ESRCH;
+
+ kvm_get_kvm(kvm);
+
head = &kvm->arch.track_notifier_head;
write_lock(&kvm->mmu_lock);
hlist_add_head_rcu(&n->node, &head->track_notifier_list);
write_unlock(&kvm->mmu_lock);
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
@@ -234,9 +180,8 @@ EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
* stop receiving the event interception. It is the opposed operation of
* kvm_page_track_register_notifier().
*/
-void
-kvm_page_track_unregister_notifier(struct kvm *kvm,
- struct kvm_page_track_notifier_node *n)
+void kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
{
struct kvm_page_track_notifier_head *head;
@@ -246,6 +191,8 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,
hlist_del_rcu(&n->node);
write_unlock(&kvm->mmu_lock);
synchronize_srcu(&head->track_srcu);
+
+ kvm_put_kvm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
@@ -256,34 +203,30 @@ EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
* The node should figure out if the written page is the one that node is
* interested in by itself.
*/
-void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
- int bytes)
+void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes)
{
struct kvm_page_track_notifier_head *head;
struct kvm_page_track_notifier_node *n;
int idx;
- head = &vcpu->kvm->arch.track_notifier_head;
+ head = &kvm->arch.track_notifier_head;
if (hlist_empty(&head->track_notifier_list))
return;
idx = srcu_read_lock(&head->track_srcu);
hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
- srcu_read_lock_held(&head->track_srcu))
+ srcu_read_lock_held(&head->track_srcu))
if (n->track_write)
- n->track_write(vcpu, gpa, new, bytes, n);
+ n->track_write(gpa, new, bytes, n);
srcu_read_unlock(&head->track_srcu, idx);
}
/*
- * Notify the node that memory slot is being removed or moved so that it can
- * drop write-protection for the pages in the memory slot.
- *
- * The node should figure out it has any write-protected pages in this slot
- * by itself.
+ * Notify external page track nodes that a memory region is being removed from
+ * the VM, e.g. so that users can free any associated metadata.
*/
-void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
struct kvm_page_track_notifier_head *head;
struct kvm_page_track_notifier_node *n;
@@ -296,8 +239,69 @@ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
idx = srcu_read_lock(&head->track_srcu);
hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
- srcu_read_lock_held(&head->track_srcu))
- if (n->track_flush_slot)
- n->track_flush_slot(kvm, slot, n);
+ srcu_read_lock_held(&head->track_srcu))
+ if (n->track_remove_region)
+ n->track_remove_region(slot->base_gfn, slot->npages, n);
srcu_read_unlock(&head->track_srcu, idx);
}
+
+/*
+ * add guest page to the tracking pool so that corresponding access on that
+ * page will be intercepted.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @gfn: the guest page.
+ */
+int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EINVAL;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ __kvm_write_track_add_gfn(kvm, slot, gfn);
+ write_unlock(&kvm->mmu_lock);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_track_add_gfn);
+
+/*
+ * remove the guest page from the tracking pool which stops the interception
+ * of corresponding access on that page.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @gfn: the guest page.
+ */
+int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EINVAL;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ __kvm_write_track_remove_gfn(kvm, slot, gfn);
+ write_unlock(&kvm->mmu_lock);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_track_remove_gfn);
+#endif
diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h
new file mode 100644
index 000000000000..d4d72ed999b1
--- /dev/null
+++ b/arch/x86/kvm/mmu/page_track.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_PAGE_TRACK_H
+#define __KVM_X86_PAGE_TRACK_H
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_page_track.h>
+
+
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages);
+
+void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn);
+void __kvm_write_track_remove_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn);
+
+bool kvm_gfn_is_write_tracked(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn);
+
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+int kvm_page_track_init(struct kvm *kvm);
+void kvm_page_track_cleanup(struct kvm *kvm);
+
+void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes);
+void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm)
+{
+ return !hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list);
+}
+#else
+static inline int kvm_page_track_init(struct kvm *kvm) { return 0; }
+static inline void kvm_page_track_cleanup(struct kvm *kvm) { }
+
+static inline void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa,
+ const u8 *new, int bytes) { }
+static inline void kvm_page_track_delete_slot(struct kvm *kvm,
+ struct kvm_memory_slot *slot) { }
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; }
+
+#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
+
+static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
+{
+ __kvm_page_track_write(vcpu->kvm, gpa, new, bytes);
+
+ kvm_mmu_track_write(vcpu, gpa, new, bytes);
+}
+
+#endif /* __KVM_X86_PAGE_TRACK_H */
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 0662e0278e70..c85255073f67 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -338,7 +338,6 @@ retry_walk:
}
#endif
walker->max_level = walker->level;
- ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
/*
* FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
@@ -348,9 +347,21 @@ retry_walk:
nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
pte_access = ~0;
+
+ /*
+ * Queue a page fault for injection if this assertion fails, as callers
+ * assume that walker.fault contains sane info on a walk failure. I.e.
+ * avoid making the situation worse by inducing even worse badness
+ * between when the assertion fails and when KVM kicks the vCPU out to
+ * userspace (because the VM is bugged).
+ */
+ if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm))
+ goto error;
+
++walker->level;
do {
+ struct kvm_memory_slot *slot;
unsigned long host_addr;
pt_access = pte_access;
@@ -381,7 +392,11 @@ retry_walk:
if (unlikely(real_gpa == INVALID_GPA))
return 0;
- host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa));
+ if (!kvm_is_visible_memslot(slot))
+ goto error;
+
+ host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa),
&walker->pte_writable[walker->level - 1]);
if (unlikely(kvm_is_error_hva(host_addr)))
goto error;
@@ -456,9 +471,6 @@ retry_walk:
goto retry_walk;
}
- pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, walker->pte_access,
- walker->pt_access[walker->level - 1]);
return 1;
error:
@@ -529,8 +541,6 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
- pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
@@ -638,8 +648,19 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (FNAME(gpte_changed)(vcpu, gw, top_level))
goto out_gpte_changed;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ goto out_gpte_changed;
+
+ /*
+ * Load a new root and retry the faulting instruction in the extremely
+ * unlikely scenario that the guest root gfn became visible between
+ * loading a dummy root and handling the resulting page fault, e.g. if
+ * userspace create a memslot in the interim.
+ */
+ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
+ kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
goto out_gpte_changed;
+ }
for_each_shadow_entry(vcpu, fault->addr, it) {
gfn_t table_gfn;
@@ -758,7 +779,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
struct guest_walker walker;
int r;
- pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
WARN_ON_ONCE(fault->is_tdp);
/*
@@ -773,7 +793,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
- pgprintk("%s: guest page fault\n", __func__);
if (!fault->prefetch)
kvm_inject_emulated_page_fault(vcpu, &walker.fault);
@@ -837,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
int offset = 0;
- WARN_ON(sp->role.level != PG_LEVEL_4K);
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
if (PTTYPE == 32)
offset = sp->role.quadrant << SPTE_LEVEL_BITS;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index cf2c6426a6fc..4a599130e9c9 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -61,7 +61,7 @@ static u64 generation_mmio_spte_mask(u64 gen)
{
u64 mask;
- WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
+ WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK);
mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -221,8 +221,6 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* shadow pages and unsync'ing pages is not allowed.
*/
if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) {
- pgprintk("%s: found shadow page for %llx, marking ro\n",
- __func__, gfn);
wrprot = true;
pte_access &= ~ACC_WRITE_MASK;
spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
@@ -242,7 +240,7 @@ out:
if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
/* Enforced by kvm_mmu_hugepage_adjust. */
- WARN_ON(level > PG_LEVEL_4K);
+ WARN_ON_ONCE(level > PG_LEVEL_4K);
mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1279db2eab44..a129951c9a88 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -3,6 +3,7 @@
#ifndef KVM_X86_MMU_SPTE_H
#define KVM_X86_MMU_SPTE_H
+#include "mmu.h"
#include "mmu_internal.h"
/*
@@ -236,6 +237,18 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
}
+static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
+{
+ if (kvm_mmu_is_dummy_root(root))
+ return NULL;
+
+ /*
+ * The "root" may be a special root, e.g. a PAE entry, treat it as a
+ * SPTE to ensure any non-PA bits are dropped.
+ */
+ return spte_to_child_sp(root);
+}
+
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
@@ -265,13 +278,13 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
- MMU_WARN_ON(!is_shadow_present_pte(spte));
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
- MMU_WARN_ON(!is_shadow_present_pte(spte));
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
/*
* This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
* and non-TDP SPTEs will never set these bits. Optimize for 64-bit
@@ -282,13 +295,13 @@ static inline bool spte_ad_need_write_protect(u64 spte)
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
- MMU_WARN_ON(!is_shadow_present_pte(spte));
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
- MMU_WARN_ON(!is_shadow_present_pte(spte));
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index d2eb0d4f8710..bd30ebfb2f2c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -39,13 +39,14 @@ void tdp_iter_restart(struct tdp_iter *iter)
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn)
{
- int root_level = root->role.level;
-
- WARN_ON(root_level < 1);
- WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
+ if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
+ (root->role.level > PT64_ROOT_MAX_LEVEL))) {
+ iter->valid = false;
+ return;
+ }
iter->next_last_level_gfn = next_last_level_gfn;
- iter->root_level = root_level;
+ iter->root_level = root->role.level;
iter->min_level = min_level;
iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
iter->as_id = kvm_mmu_page_as_id(root);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 512163d52194..6cd4dd631a2f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -12,18 +12,10 @@
#include <trace/events/kvm.h>
/* Initializes the TDP MMU for the VM, if enabled. */
-int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
- struct workqueue_struct *wq;
-
- wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
- if (!wq)
- return -ENOMEM;
-
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
- kvm->arch.tdp_mmu_zap_wq = wq;
- return 1;
}
/* Arbitrarily returns true so that this may be used in if statements. */
@@ -46,20 +38,15 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
* ultimately frees all roots.
*/
kvm_tdp_mmu_invalidate_all_roots(kvm);
-
- /*
- * Destroying a workqueue also first flushes the workqueue, i.e. no
- * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
- */
- destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
+ kvm_tdp_mmu_zap_invalidated_roots(kvm);
WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
* Ensure that all the outstanding RCU callbacks to free shadow pages
- * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
- * can call kvm_tdp_mmu_put_root and create new callbacks.
+ * can run before the VM is torn down. Putting the last reference to
+ * zapped roots will create new callbacks.
*/
rcu_barrier();
}
@@ -86,46 +73,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
tdp_mmu_free_sp(sp);
}
-static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
- bool shared);
-
-static void tdp_mmu_zap_root_work(struct work_struct *work)
-{
- struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
- tdp_mmu_async_work);
- struct kvm *kvm = root->tdp_mmu_async_data;
-
- read_lock(&kvm->mmu_lock);
-
- /*
- * A TLB flush is not necessary as KVM performs a local TLB flush when
- * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
- * to a different pCPU. Note, the local TLB flush on reuse also
- * invalidates any paging-structure-cache entries, i.e. TLB entries for
- * intermediate paging structures, that may be zapped, as such entries
- * are associated with the ASID on both VMX and SVM.
- */
- tdp_mmu_zap_root(kvm, root, true);
-
- /*
- * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
- * avoiding an infinite loop. By design, the root is reachable while
- * it's being asynchronously zapped, thus a different task can put its
- * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
- * asynchronously zapped root is unavoidable.
- */
- kvm_tdp_mmu_put_root(kvm, root, true);
-
- read_unlock(&kvm->mmu_lock);
-}
-
-static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
-{
- root->tdp_mmu_async_data = kvm;
- INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
- queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
-}
-
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{
@@ -211,8 +158,12 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
- __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \
+ _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \
+ if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \
+ } else
/*
* Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
@@ -292,7 +243,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
* by a memslot update or by the destruction of the VM. Initialize the
* refcount to two; one reference for the vCPU, and one reference for
* the TDP MMU itself, which is held until the root is invalidated and
- * is ultimately put by tdp_mmu_zap_root_work().
+ * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
*/
refcount_set(&root->tdp_mmu_root_count, 2);
@@ -475,9 +426,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
- WARN_ON(level > PT64_ROOT_MAX_LEVEL);
- WARN_ON(level < PG_LEVEL_4K);
- WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+ WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
+ WARN_ON_ONCE(level < PG_LEVEL_4K);
+ WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
/*
* If this warning were to trigger it would indicate that there was a
@@ -522,9 +473,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* impact the guest since both the former and current SPTEs
* are nonpresent.
*/
- if (WARN_ON(!is_mmio_spte(old_spte) &&
- !is_mmio_spte(new_spte) &&
- !is_removed_spte(new_spte)))
+ if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
+ !is_mmio_spte(new_spte) &&
+ !is_removed_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
@@ -661,7 +612,7 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
* should be used. If operating under the MMU lock in write mode, the
* use of the removed SPTE should not be necessary.
*/
- WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
+ WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
@@ -689,7 +640,7 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
else
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
- for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
+ for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
/*
* Yield if the MMU lock is contended or this thread needs to return control
@@ -709,7 +660,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
struct tdp_iter *iter,
bool flush, bool shared)
{
- WARN_ON(iter->yielded);
+ WARN_ON_ONCE(iter->yielded);
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -728,7 +679,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
rcu_read_lock();
- WARN_ON(iter->gfn > iter->next_last_level_gfn);
+ WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
iter->yielded = true;
}
@@ -877,13 +828,12 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
* true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
* more SPTEs were zapped since the MMU lock was last acquired.
*/
-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
- bool can_yield, bool flush)
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
- flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, false)
+ flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
return flush;
}
@@ -891,7 +841,6 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *root;
- int i;
/*
* Zap all roots, including invalid roots, as all SPTEs must be dropped
@@ -905,10 +854,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
* is being destroyed or the userspace VMM has exited. In both cases,
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
*/
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- for_each_tdp_mmu_root_yield_safe(kvm, root, i)
- tdp_mmu_zap_root(kvm, root, false);
- }
+ for_each_tdp_mmu_root_yield_safe(kvm, root, false)
+ tdp_mmu_zap_root(kvm, root, false);
}
/*
@@ -917,18 +864,47 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
*/
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
- flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
+ struct kvm_mmu_page *root;
+
+ read_lock(&kvm->mmu_lock);
+
+ for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
+ if (!root->tdp_mmu_scheduled_root_to_zap)
+ continue;
+
+ root->tdp_mmu_scheduled_root_to_zap = false;
+ KVM_BUG_ON(!root->role.invalid, kvm);
+
+ /*
+ * A TLB flush is not necessary as KVM performs a local TLB
+ * flush when allocating a new root (see kvm_mmu_load()), and
+ * when migrating a vCPU to a different pCPU. Note, the local
+ * TLB flush on reuse also invalidates paging-structure-cache
+ * entries, i.e. TLB entries for intermediate paging structures,
+ * that may be zapped, as such entries are associated with the
+ * ASID on both VMX and SVM.
+ */
+ tdp_mmu_zap_root(kvm, root, true);
+
+ /*
+ * The referenced needs to be put *after* zapping the root, as
+ * the root must be reachable by mmu_notifiers while it's being
+ * zapped
+ */
+ kvm_tdp_mmu_put_root(kvm, root, true);
+ }
+
+ read_unlock(&kvm->mmu_lock);
}
/*
* Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
* is about to be zapped, e.g. in response to a memslots update. The actual
- * zapping is performed asynchronously. Using a separate workqueue makes it
- * easy to ensure that the destruction is performed before the "fast zap"
- * completes, without keeping a separate list of invalidated roots; the list is
- * effectively the list of work items in the workqueue.
+ * zapping is done separately so that it happens with mmu_lock with read,
+ * whereas invalidating roots must be done with mmu_lock held for write (unless
+ * the VM is being destroyed).
*
- * Note, the asynchronous worker is gifted the TDP MMU's reference.
+ * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
* See kvm_tdp_mmu_get_vcpu_root_hpa().
*/
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
@@ -953,19 +929,20 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
/*
* As above, mmu_lock isn't held when destroying the VM! There can't
* be other references to @kvm, i.e. nothing else can invalidate roots
- * or be consuming roots, but walking the list of roots does need to be
- * guarded against roots being deleted by the asynchronous zap worker.
+ * or get/put references to roots.
*/
- rcu_read_lock();
-
- list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+ /*
+ * Note, invalid roots can outlive a memslot update! Invalid
+ * roots must be *zapped* before the memslot update completes,
+ * but a different task can acquire a reference and keep the
+ * root alive after its been zapped.
+ */
if (!root->role.invalid) {
+ root->tdp_mmu_scheduled_root_to_zap = true;
root->role.invalid = true;
- tdp_mmu_schedule_zap_root(kvm, root);
}
}
-
- rcu_read_unlock();
}
/*
@@ -1146,8 +1123,13 @@ retry:
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
{
- return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
- range->end, range->may_block, flush);
+ struct kvm_mmu_page *root;
+
+ __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
+ flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
+ range->may_block, flush);
+
+ return flush;
}
typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
@@ -1241,7 +1223,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte;
/* Huge pages aren't expected to be modified without first being zapped. */
- WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
+ WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
if (iter->level != PG_LEVEL_4K ||
!is_shadow_present_pte(iter->old_spte))
@@ -1255,9 +1237,9 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
*/
tdp_mmu_iter_set_spte(kvm, iter, 0);
- if (!pte_write(range->pte)) {
+ if (!pte_write(range->arg.pte)) {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
- pte_pfn(range->pte));
+ pte_pfn(range->arg.pte));
tdp_mmu_iter_set_spte(kvm, iter, new_spte);
}
@@ -1548,8 +1530,8 @@ retry:
if (!is_shadow_present_pte(iter.old_spte))
continue;
- MMU_WARN_ON(kvm_ad_enabled() &&
- spte_ad_need_write_protect(iter.old_spte));
+ KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
if (!(iter.old_spte & dbit))
continue;
@@ -1600,6 +1582,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
shadow_dirty_mask;
struct tdp_iter iter;
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
@@ -1607,8 +1591,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
if (!mask)
break;
- MMU_WARN_ON(kvm_ad_enabled() &&
- spte_ad_need_write_protect(iter.old_spte));
+ KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
if (iter.level > PG_LEVEL_4K ||
!(mask & (1UL << (iter.gfn - gfn))))
@@ -1646,7 +1630,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_mmu_page *root;
- lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 0a63b1afabd3..733a3aef3a96 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,7 +7,7 @@
#include "spte.h"
-int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
@@ -20,8 +20,7 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared);
-bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
- gfn_t end, bool can_yield, bool flush);
+bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index bf653df86112..edb89b51b383 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -382,9 +382,6 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc)
struct kvm_x86_pmu_event_filter *filter;
struct kvm *kvm = pmc->vcpu->kvm;
- if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
- return false;
-
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (!filter)
return true;
@@ -398,6 +395,7 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc)
static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
+ static_call(kvm_x86_pmu_hw_event_available)(pmc) &&
check_pmu_event_filter(pmc);
}
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 56cbdb24400a..b81650678375 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -43,6 +43,7 @@ enum kvm_only_cpuid_leafs {
/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */
#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4)
#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
+#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
/* CPUID level 0x80000007 (EDX). */
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index cfc8ab773025..2092db892d7d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -791,6 +791,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
int ret = 0;
unsigned long flags;
struct amd_svm_iommu_ir *ir;
+ u64 entry;
/**
* In some cases, the existing irte is updated and re-set,
@@ -824,6 +825,18 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
ir->data = pi->ir_data;
spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is running.
+ * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
+ * will update the pCPU info when the vCPU awkened and/or scheduled in.
+ * See also avic_vcpu_load().
+ */
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+ amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
+ true, pi->ir_data);
+
list_add(&ir->node, &svm->ir_list);
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
out:
@@ -986,10 +999,11 @@ static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
int ret = 0;
- unsigned long flags;
struct amd_svm_iommu_ir *ir;
struct vcpu_svm *svm = to_svm(vcpu);
+ lockdep_assert_held(&svm->ir_list_lock);
+
if (!kvm_arch_has_assigned_device(vcpu->kvm))
return 0;
@@ -997,19 +1011,15 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
* Here, we go through the per-vcpu ir_list to update all existing
* interrupt remapping table entry targeting this vcpu.
*/
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
if (list_empty(&svm->ir_list))
- goto out;
+ return 0;
list_for_each_entry(ir, &svm->ir_list, node) {
ret = amd_iommu_update_ga(cpu, r, ir->data);
if (ret)
- break;
+ return ret;
}
-out:
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
- return ret;
+ return 0;
}
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1017,6 +1027,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
u64 entry;
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;
lockdep_assert_preemption_disabled();
@@ -1033,6 +1044,15 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_vcpu_is_blocking(vcpu))
return;
+ /*
+ * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
+ * _currently_ have assigned devices, as that can change. Holding
+ * ir_list_lock ensures that either svm_ir_list_add() will consume
+ * up-to-date entry information, or that this task will wait until
+ * svm_ir_list_add() completes to set the new target pCPU.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
entry = READ_ONCE(*(svm->avic_physical_id_cache));
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
@@ -1042,25 +1062,48 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long flags;
lockdep_assert_preemption_disabled();
+ /*
+ * Note, reading the Physical ID entry outside of ir_list_lock is safe
+ * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+ * to modify the entry, and preemption is disabled. I.e. the vCPU
+ * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+ * recursively.
+ */
entry = READ_ONCE(*(svm->avic_physical_id_cache));
/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
return;
+ /*
+ * Take and hold the per-vCPU interrupt remapping lock while updating
+ * the Physical ID entry even though the lock doesn't protect against
+ * multiple writers (see above). Holding ir_list_lock ensures that
+ * either svm_ir_list_add() will consume up-to-date entry information,
+ * or that this task will wait until svm_ir_list_add() completes to
+ * mark the vCPU as not running.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+
}
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 96936ddf1b3c..dd496c9e5f91 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -107,7 +107,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
{
- if (!svm->v_vmload_vmsave_enabled)
+ if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
return true;
if (!nested_npt_enabled(svm))
@@ -552,6 +552,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
bool new_vmcb12 = false;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
nested_vmcb02_compute_g_pat(svm);
@@ -577,18 +578,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
vmcb_mark_dirty(vmcb02, VMCB_DT);
}
- kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
+ kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
- svm_set_efer(&svm->vcpu, svm->nested.save.efer);
+ svm_set_efer(vcpu, svm->nested.save.efer);
- svm_set_cr0(&svm->vcpu, svm->nested.save.cr0);
- svm_set_cr4(&svm->vcpu, svm->nested.save.cr4);
+ svm_set_cr0(vcpu, svm->nested.save.cr0);
+ svm_set_cr4(vcpu, svm->nested.save.cr4);
svm->vcpu.arch.cr2 = vmcb12->save.cr2;
- kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
- kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
- kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
+ kvm_rax_write(vcpu, vmcb12->save.rax);
+ kvm_rsp_write(vcpu, vmcb12->save.rsp);
+ kvm_rip_write(vcpu, vmcb12->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
vmcb02->save.rax = vmcb12->save.rax;
@@ -602,7 +603,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
vmcb_mark_dirty(vmcb02, VMCB_DR);
}
- if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
/*
* Reserved bits of DEBUGCTL are ignored. Be consistent with
* svm_set_msr's definition of reserved bits.
@@ -658,7 +660,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
* exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
*/
- if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+ if (guest_can_use(vcpu, X86_FEATURE_VGIF) &&
+ (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
else
int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
@@ -695,10 +698,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
- if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) {
- WARN_ON(!svm->tsc_scaling_enabled);
+ if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio)
nested_svm_update_tsc_ratio_msr(vcpu);
- }
vmcb02->control.int_ctl =
(svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
@@ -717,7 +719,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
* what a nrips=0 CPU would do (L1 is responsible for advancing RIP
* prior to injecting the event).
*/
- if (svm->nrips_enabled)
+ if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
vmcb02->control.next_rip = svm->nested.ctl.next_rip;
else if (boot_cpu_has(X86_FEATURE_NRIPS))
vmcb02->control.next_rip = vmcb12_rip;
@@ -727,7 +729,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
svm->soft_int_injected = true;
svm->soft_int_csbase = vmcb12_csbase;
svm->soft_int_old_rip = vmcb12_rip;
- if (svm->nrips_enabled)
+ if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
svm->soft_int_next_rip = svm->nested.ctl.next_rip;
else
svm->soft_int_next_rip = vmcb12_rip;
@@ -735,15 +737,21 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.virt_ext = vmcb01->control.virt_ext &
LBR_CTL_ENABLE_MASK;
- if (svm->lbrv_enabled)
+ if (guest_can_use(vcpu, X86_FEATURE_LBRV))
vmcb02->control.virt_ext |=
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
if (!nested_vmcb_needs_vls_intercept(svm))
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
- pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
- pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
+ if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER))
+ pause_count12 = svm->nested.ctl.pause_filter_count;
+ else
+ pause_count12 = 0;
+ if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD))
+ pause_thresh12 = svm->nested.ctl.pause_filter_thresh;
+ else
+ pause_thresh12 = 0;
if (kvm_pause_in_guest(svm->vcpu.kvm)) {
/* use guest values since host doesn't intercept PAUSE */
vmcb02->control.pause_filter_count = pause_count12;
@@ -1027,7 +1035,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (vmcb12->control.exit_code != SVM_EXIT_ERR)
nested_save_pending_event_to_vmcb12(svm, vmcb12);
- if (svm->nrips_enabled)
+ if (guest_can_use(vcpu, X86_FEATURE_NRIPS))
vmcb12->control.next_rip = vmcb02->control.next_rip;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
@@ -1066,7 +1074,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (!nested_exit_on_intr(svm))
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
- if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
svm_copy_lbrs(vmcb12, vmcb02);
svm_update_lbrv(vcpu);
} else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
@@ -1101,10 +1110,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
}
- if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) {
- WARN_ON(!svm->tsc_scaling_enabled);
+ if (kvm_caps.has_tsc_control &&
+ vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) {
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
- __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+ svm_write_tsc_multiplier(vcpu);
}
svm->nested.ctl.nested_cr3 = 0;
@@ -1537,7 +1546,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
vcpu->arch.tsc_scaling_ratio =
kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
svm->tsc_ratio_msr);
- __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+ svm_write_tsc_multiplier(vcpu);
}
/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index d3aec1f2cad2..4900c078045a 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -23,6 +23,7 @@
#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/fpu/xcr.h>
+#include <asm/debugreg.h>
#include "mmu.h"
#include "x86.h"
@@ -54,9 +55,14 @@ module_param_named(sev, sev_enabled, bool, 0444);
/* enable/disable SEV-ES support */
static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
+
+/* enable/disable SEV-ES DebugSwap support */
+static bool sev_es_debug_swap_enabled = true;
+module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
#else
#define sev_enabled false
#define sev_es_enabled false
+#define sev_es_debug_swap_enabled false
#endif /* CONFIG_KVM_AMD_SEV */
static u8 sev_enc_bit;
@@ -606,6 +612,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
+ if (sev_es_debug_swap_enabled)
+ save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+
pr_debug("Virtual Machine Save Area (VMSA):\n");
print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
@@ -619,6 +628,11 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
struct vcpu_svm *svm = to_svm(vcpu);
int ret;
+ if (vcpu->guest_debug) {
+ pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported");
+ return -EINVAL;
+ }
+
/* Perform some pre-encryption checks against the VMSA */
ret = sev_es_sync_vmsa(svm);
if (ret)
@@ -1725,7 +1739,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
* Note, the source is not required to have the same number of
* vCPUs as the destination when migrating a vanilla SEV VM.
*/
- src_vcpu = kvm_get_vcpu(dst_kvm, i);
+ src_vcpu = kvm_get_vcpu(src_kvm, i);
src_svm = to_svm(src_vcpu);
/*
@@ -2171,7 +2185,7 @@ void __init sev_hardware_setup(void)
bool sev_es_supported = false;
bool sev_supported = false;
- if (!sev_enabled || !npt_enabled)
+ if (!sev_enabled || !npt_enabled || !nrips)
goto out;
/*
@@ -2256,6 +2270,9 @@ out:
sev_enabled = sev_supported;
sev_es_enabled = sev_es_supported;
+ if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
+ !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
+ sev_es_debug_swap_enabled = false;
#endif
}
@@ -2881,7 +2898,10 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
svm->sev_es.ghcb_sa);
break;
case SVM_VMGEXIT_NMI_COMPLETE:
- ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
+ ++vcpu->stat.nmi_window_exits;
+ svm->nmi_masked = false;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ ret = 1;
break;
case SVM_VMGEXIT_AP_HLT_LOOP:
ret = kvm_emulate_ap_reset_hold(vcpu);
@@ -2942,8 +2962,35 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
count, in);
}
+static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
+ bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+
+ set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
+ }
+}
+
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_cpuid_entry2 *best;
+
+ /* For sev guests, the memory encryption bit is not reserved in CR3. */
+ best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
+ if (best)
+ vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
+
+ if (sev_es_guest(svm->vcpu.kvm))
+ sev_es_vcpu_after_set_cpuid(svm);
+}
+
static void sev_es_init_vmcb(struct vcpu_svm *svm)
{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
struct kvm_vcpu *vcpu = &svm->vcpu;
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
@@ -2952,9 +2999,12 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
/*
* An SEV-ES guest requires a VMSA area that is a separate from the
* VMCB page. Do not include the encryption mask on the VMSA physical
- * address since hardware will access it using the guest key.
+ * address since hardware will access it using the guest key. Note,
+ * the VMSA will be NULL if this vCPU is the destination for intrahost
+ * migration, and will be copied later.
*/
- svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+ if (svm->sev_es.vmsa)
+ svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
/* Can't intercept CR register access, HV can't modify CR registers */
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
@@ -2972,8 +3022,23 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, TRAP_CR4_WRITE);
svm_set_intercept(svm, TRAP_CR8_WRITE);
- /* No support for enable_vmware_backdoor */
- clr_exception_intercept(svm, GP_VECTOR);
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
+ if (!sev_es_debug_swap_enabled) {
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+ recalc_intercepts(svm);
+ } else {
+ /*
+ * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't
+ * allow debugging SEV-ES guests, and enables DebugSwap iff
+ * NO_NESTED_DATA_BP is supported, so there's no reason to
+ * intercept #DB when DebugSwap is enabled. For simplicity
+ * with respect to guest debug, intercept #DB for other VMs
+ * even if NO_NESTED_DATA_BP is supported, i.e. even if the
+ * guest can't DoS the CPU with infinite #DB vectoring.
+ */
+ clr_exception_intercept(svm, DB_VECTOR);
+ }
/* Can't intercept XSETBV, HV can't modify XCR0 directly */
svm_clr_intercept(svm, INTERCEPT_XSETBV);
@@ -2985,14 +3050,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
-
- if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) &&
- (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) {
- set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1);
- if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP))
- svm_clr_intercept(svm, INTERCEPT_RDTSCP);
- }
}
void sev_init_vmcb(struct vcpu_svm *svm)
@@ -3000,6 +3057,12 @@ void sev_init_vmcb(struct vcpu_svm *svm)
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
clr_exception_intercept(svm, UD_VECTOR);
+ /*
+ * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
+ * KVM can't decrypt guest memory to decode the faulting instruction.
+ */
+ clr_exception_intercept(svm, GP_VECTOR);
+
if (sev_es_guest(svm->vcpu.kvm))
sev_es_init_vmcb(svm);
}
@@ -3018,20 +3081,41 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
{
/*
- * As an SEV-ES guest, hardware will restore the host state on VMEXIT,
- * of which one step is to perform a VMLOAD. KVM performs the
- * corresponding VMSAVE in svm_prepare_guest_switch for both
- * traditional and SEV-ES guests.
+ * All host state for SEV-ES guests is categorized into three swap types
+ * based on how it is handled by hardware during a world switch:
+ *
+ * A: VMRUN: Host state saved in host save area
+ * VMEXIT: Host state loaded from host save area
+ *
+ * B: VMRUN: Host state _NOT_ saved in host save area
+ * VMEXIT: Host state loaded from host save area
+ *
+ * C: VMRUN: Host state _NOT_ saved in host save area
+ * VMEXIT: Host state initialized to default(reset) values
+ *
+ * Manually save type-B state, i.e. state that is loaded by VMEXIT but
+ * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
+ * by common SVM code).
*/
-
- /* XCR0 is restored on VMEXIT, save the current host value */
hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-
- /* PKRU is restored on VMEXIT, save the current host value */
hostsa->pkru = read_pkru();
-
- /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */
hostsa->xss = host_xss;
+
+ /*
+ * If DebugSwap is enabled, debug registers are loaded but NOT saved by
+ * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
+ * saves and loads debug registers (Type-A).
+ */
+ if (sev_es_debug_swap_enabled) {
+ hostsa->dr0 = native_get_debugreg(0);
+ hostsa->dr1 = native_get_debugreg(1);
+ hostsa->dr2 = native_get_debugreg(2);
+ hostsa->dr3 = native_get_debugreg(3);
+ hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
+ hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
+ hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
+ hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
+ }
}
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d4bfdc607fe7..9507df93f410 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -39,10 +39,9 @@
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
+#include <asm/reboot.h>
#include <asm/fpu/api.h>
-#include <asm/virtext.h>
-
#include <trace/events/ipi.h>
#include "trace.h"
@@ -203,7 +202,7 @@ static int nested = true;
module_param(nested, int, S_IRUGO);
/* enable/disable Next RIP Save */
-static int nrips = true;
+int nrips = true;
module_param(nrips, int, 0444);
/* enable/disable Virtual VMLOAD VMSAVE */
@@ -365,6 +364,8 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
}
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
bool commit_side_effects)
@@ -385,6 +386,14 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
}
if (!svm->next_rip) {
+ /*
+ * FIXME: Drop this when kvm_emulate_instruction() does the
+ * right thing and treats "can't emulate" as outright failure
+ * for EMULTYPE_SKIP.
+ */
+ if (!svm_can_emulate_instruction(vcpu, EMULTYPE_SKIP, NULL, 0))
+ return 0;
+
if (unlikely(!commit_side_effects))
old_rflags = svm->vmcb->save.rflags;
@@ -517,14 +526,21 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu)
vcpu->arch.osvw.status |= 1;
}
-static bool kvm_is_svm_supported(void)
+static bool __kvm_is_svm_supported(void)
{
- int cpu = raw_smp_processor_id();
- const char *msg;
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
u64 vm_cr;
- if (!cpu_has_svm(&msg)) {
- pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
+ if (c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON) {
+ pr_err("CPU %d isn't AMD or Hygon\n", cpu);
+ return false;
+ }
+
+ if (!cpu_has(c, X86_FEATURE_SVM)) {
+ pr_err("SVM not supported by CPU %d\n", cpu);
return false;
}
@@ -542,25 +558,55 @@ static bool kvm_is_svm_supported(void)
return true;
}
+static bool kvm_is_svm_supported(void)
+{
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_svm_supported();
+ migrate_enable();
+
+ return supported;
+}
+
static int svm_check_processor_compat(void)
{
- if (!kvm_is_svm_supported())
+ if (!__kvm_is_svm_supported())
return -EIO;
return 0;
}
-void __svm_write_tsc_multiplier(u64 multiplier)
+static void __svm_write_tsc_multiplier(u64 multiplier)
{
- preempt_disable();
-
if (multiplier == __this_cpu_read(current_tsc_ratio))
- goto out;
+ return;
wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
__this_cpu_write(current_tsc_ratio, multiplier);
-out:
- preempt_enable();
+}
+
+static inline void kvm_cpu_svm_disable(void)
+{
+ uint64_t efer;
+
+ wrmsrl(MSR_VM_HSAVE_PA, 0);
+ rdmsrl(MSR_EFER, efer);
+ if (efer & EFER_SVME) {
+ /*
+ * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
+ * NMI aren't blocked.
+ */
+ stgi();
+ wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ }
+}
+
+static void svm_emergency_disable(void)
+{
+ kvm_rebooting = true;
+
+ kvm_cpu_svm_disable();
}
static void svm_hardware_disable(void)
@@ -569,7 +615,7 @@ static void svm_hardware_disable(void)
if (tsc_scaling)
__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
- cpu_svm_disable();
+ kvm_cpu_svm_disable();
amd_pmu_disable_virt();
}
@@ -637,6 +683,21 @@ static int svm_hardware_enable(void)
amd_pmu_enable_virt();
+ /*
+ * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type
+ * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests.
+ * Since Linux does not change the value of TSC_AUX once set, prime the
+ * TSC_AUX field now to avoid a RDMSR on every vCPU run.
+ */
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
+ struct sev_es_save_area *hostsa;
+ u32 msr_hi;
+
+ hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
+
+ rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi);
+ }
+
return 0;
}
@@ -677,6 +738,39 @@ free_save_area:
}
+static void set_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+ vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+
+ recalc_intercepts(svm);
+}
+
+static void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+
+ vmcb->control.intercepts[INTERCEPT_DR] = 0;
+
+ recalc_intercepts(svm);
+}
+
static int direct_access_msr_slot(u32 msr)
{
u32 i;
@@ -947,50 +1041,24 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
}
-static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
+static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
{
/*
- * If the LBR virtualization is disabled, the LBR msrs are always
- * kept in the vmcb01 to avoid copying them on nested guest entries.
- *
- * If nested, and the LBR virtualization is enabled/disabled, the msrs
- * are moved between the vmcb01 and vmcb02 as needed.
+ * If LBR virtualization is disabled, the LBR MSRs are always kept in
+ * vmcb01. If LBR virtualization is enabled and L1 is running VMs of
+ * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
*/
- struct vmcb *vmcb =
- (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
- svm->vmcb : svm->vmcb01.ptr;
-
- switch (index) {
- case MSR_IA32_DEBUGCTLMSR:
- return vmcb->save.dbgctl;
- case MSR_IA32_LASTBRANCHFROMIP:
- return vmcb->save.br_from;
- case MSR_IA32_LASTBRANCHTOIP:
- return vmcb->save.br_to;
- case MSR_IA32_LASTINTFROMIP:
- return vmcb->save.last_excp_from;
- case MSR_IA32_LASTINTTOIP:
- return vmcb->save.last_excp_to;
- default:
- KVM_BUG(false, svm->vcpu.kvm,
- "%s: Unknown MSR 0x%x", __func__, index);
- return 0;
- }
+ return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
+ svm->vmcb01.ptr;
}
void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
-
- bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
- DEBUGCTLMSR_LBR;
-
- bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
- LBR_CTL_ENABLE_MASK);
-
- if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
- if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
- enable_lbrv = true;
+ bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
+ bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
+ (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
if (enable_lbrv == current_enable_lbrv)
return;
@@ -1101,21 +1169,23 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
return svm->tsc_ratio_msr;
}
-static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
- svm->vmcb->control.tsc_offset = offset;
+ svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
- __svm_write_tsc_multiplier(multiplier);
+ preempt_disable();
+ if (to_svm(vcpu)->guest_state_loaded)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
+ preempt_enable();
}
-
/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm)
@@ -1156,8 +1226,6 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
-
- svm->v_vmload_vmsave_enabled = false;
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -1201,10 +1269,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
- * as VMware does. Don't intercept #GP for SEV guests as KVM can't
- * decrypt guest memory to decode the faulting instruction.
+ * as VMware does.
*/
- if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
+ if (enable_vmware_backdoor)
set_exception_intercept(svm, GP_VECTOR);
svm_set_intercept(svm, INTERCEPT_INTR);
@@ -1480,7 +1547,14 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
if (tsc_scaling)
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
- if (likely(tsc_aux_uret_slot >= 0))
+ /*
+ * TSC_AUX is always virtualized for SEV-ES guests when the feature is
+ * available. The user return MSR support is not required in this case
+ * because TSC_AUX is restored on #VMEXIT from the host save area
+ * (which has been initialized in svm_hardware_enable()).
+ */
+ if (likely(tsc_aux_uret_slot >= 0) &&
+ (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
svm->guest_state_loaded = true;
@@ -1949,7 +2023,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (vcpu->arch.guest_state_protected)
+ if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
return;
get_debugreg(vcpu->arch.db[0], 0);
@@ -2510,12 +2584,13 @@ static int iret_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
+
++vcpu->stat.nmi_window_exits;
svm->awaiting_iret_completion = true;
svm_clr_iret_intercept(svm);
- if (!sev_es_guest(vcpu->kvm))
- svm->nmi_iret_rip = kvm_rip_read(vcpu);
+ svm->nmi_iret_rip = kvm_rip_read(vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
@@ -2680,6 +2755,13 @@ static int dr_interception(struct kvm_vcpu *vcpu)
unsigned long val;
int err = 0;
+ /*
+ * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
+ * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return 1;
+
if (vcpu->guest_debug == 0) {
/*
* No more DR vmexits; force a reload of the debug registers
@@ -2764,7 +2846,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_AMD64_TSC_RATIO:
- if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
+ if (!msr_info->host_initiated &&
+ !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
return 1;
msr_info->data = svm->tsc_ratio_msr;
break;
@@ -2802,11 +2885,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = svm->tsc_aux;
break;
case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
+ break;
case MSR_IA32_LASTBRANCHFROMIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
+ break;
case MSR_IA32_LASTBRANCHTOIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
+ break;
case MSR_IA32_LASTINTFROMIP:
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
+ break;
case MSR_IA32_LASTINTTOIP:
- msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
+ msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr;
@@ -2906,7 +2997,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
switch (ecx) {
case MSR_AMD64_TSC_RATIO:
- if (!svm->tsc_scaling_enabled) {
+ if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
if (!msr->host_initiated)
return 1;
@@ -2928,7 +3019,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->tsc_ratio_msr = data;
- if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
+ if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
+ is_guest_mode(vcpu))
nested_svm_update_tsc_ratio_msr(vcpu);
break;
@@ -3017,6 +3109,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_TSC_AUX:
/*
+ * TSC_AUX is always virtualized for SEV-ES guests when the
+ * feature is available. The user return MSR support is not
+ * required in this case because TSC_AUX is restored on #VMEXIT
+ * from the host save area (which has been initialized in
+ * svm_hardware_enable()).
+ */
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
+ break;
+
+ /*
* TSC_AUX is usually changed only during boot and never read
* directly. Intercept TSC_AUX instead of exposing it to the
* guest via direct_access_msrs, and switch it via user return.
@@ -3037,13 +3139,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
- if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
- svm->vmcb->save.dbgctl = data;
- else
- svm->vmcb01.ptr->save.dbgctl = data;
-
+ svm_get_lbr_vmcb(svm)->save.dbgctl = data;
svm_update_lbrv(vcpu);
-
break;
case MSR_VM_HSAVE_PA:
/*
@@ -3769,6 +3866,19 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
return; /* IRET will cause a vm exit */
+ /*
+ * SEV-ES guests are responsible for signaling when a vCPU is ready to
+ * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
+ * KVM can't intercept and single-step IRET to detect when NMIs are
+ * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE.
+ *
+ * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
+ * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
+ * supported NAEs in the GHCB protocol.
+ */
+ if (sev_es_guest(vcpu->kvm))
+ return;
+
if (!gif_set(svm)) {
if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI);
@@ -3918,12 +4028,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
svm->soft_int_injected = false;
/*
- * If we've made progress since setting HF_IRET_MASK, we've
+ * If we've made progress since setting awaiting_iret_completion, we've
* executed an IRET and can allow NMI injection.
*/
if (svm->awaiting_iret_completion &&
- (sev_es_guest(vcpu->kvm) ||
- kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
+ kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
svm->awaiting_iret_completion = false;
svm->nmi_masked = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -4207,30 +4316,38 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_cpuid_entry2 *best;
- vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- boot_cpu_has(X86_FEATURE_XSAVE) &&
- boot_cpu_has(X86_FEATURE_XSAVES);
-
- /* Update nrips enabled cache */
- svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
- guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
-
- svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
- svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
-
- svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
-
- svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+ /*
+ * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
+ * can only disable all variants of by disallowing CR4.OSXSAVE from
+ * being set. As a result, if the host has XSAVE and XSAVES, and the
+ * guest has XSAVE enabled, the guest can execute XSAVES without
+ * faulting. Treat XSAVES as enabled in this case regardless of
+ * whether it's advertised to the guest so that KVM context switches
+ * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give
+ * the guest read/write access to the host's XSS.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVES) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+ kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
- svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
- svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+ /*
+ * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
+ * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
+ * SVM on Intel is bonkers and extremely unlikely to work).
+ */
+ if (!guest_cpuid_is_intel(vcpu))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
- svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
svm_recalc_instruction_intercepts(vcpu, svm);
@@ -4242,12 +4359,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
!!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
- /* For sev guests, the memory encryption bit is not reserved in CR3. */
- if (sev_guest(vcpu->kvm)) {
- best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
- if (best)
- vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
- }
+ if (sev_guest(vcpu->kvm))
+ sev_vcpu_after_set_cpuid(svm);
init_vmcb_after_set_cpuid(vcpu);
}
@@ -4651,16 +4764,25 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
* decode garbage.
*
- * Inject #UD if KVM reached this point without an instruction buffer.
- * In practice, this path should never be hit by a well-behaved guest,
- * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
- * is still theoretically reachable, e.g. via unaccelerated fault-like
- * AVIC access, and needs to be handled by KVM to avoid putting the
- * guest into an infinite loop. Injecting #UD is somewhat arbitrary,
- * but its the least awful option given lack of insight into the guest.
+ * If KVM is NOT trying to simply skip an instruction, inject #UD if
+ * KVM reached this point without an instruction buffer. In practice,
+ * this path should never be hit by a well-behaved guest, e.g. KVM
+ * doesn't intercept #UD or #GP for SEV guests, but this path is still
+ * theoretically reachable, e.g. via unaccelerated fault-like AVIC
+ * access, and needs to be handled by KVM to avoid putting the guest
+ * into an infinite loop. Injecting #UD is somewhat arbitrary, but
+ * its the least awful option given lack of insight into the guest.
+ *
+ * If KVM is trying to skip an instruction, simply resume the guest.
+ * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
+ * will attempt to re-inject the INT3/INTO and skip the instruction.
+ * In that scenario, retrying the INT3/INTO and hoping the guest will
+ * make forward progress is the only option that has a chance of
+ * success (and in practice it will work the vast majority of the time).
*/
if (unlikely(!insn)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (!(emul_type & EMULTYPE_SKIP))
+ kvm_queue_exception(vcpu, UD_VECTOR);
return false;
}
@@ -5112,9 +5234,11 @@ static __init int svm_hardware_setup(void)
svm_adjust_mmio_mask();
+ nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
+
/*
* Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
- * may be modified by svm_adjust_mmio_mask()).
+ * may be modified by svm_adjust_mmio_mask()), as well as nrips.
*/
sev_hardware_setup();
@@ -5126,11 +5250,6 @@ static __init int svm_hardware_setup(void)
goto err;
}
- if (nrips) {
- if (!boot_cpu_has(X86_FEATURE_NRIPS))
- nrips = false;
- }
-
enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
@@ -5213,6 +5332,13 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
.pmu_ops = &amd_pmu_ops,
};
+static void __svm_exit(void)
+{
+ kvm_x86_vendor_exit();
+
+ cpu_emergency_unregister_virt_callback(svm_emergency_disable);
+}
+
static int __init svm_init(void)
{
int r;
@@ -5226,6 +5352,8 @@ static int __init svm_init(void)
if (r)
return r;
+ cpu_emergency_register_virt_callback(svm_emergency_disable);
+
/*
* Common KVM initialization _must_ come last, after this, /dev/kvm is
* exposed to userspace!
@@ -5238,14 +5366,14 @@ static int __init svm_init(void)
return 0;
err_kvm_init:
- kvm_x86_vendor_exit();
+ __svm_exit();
return r;
}
static void __exit svm_exit(void)
{
kvm_exit();
- kvm_x86_vendor_exit();
+ __svm_exit();
}
module_init(svm_init)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 8239c8de45ac..be67ab7fdd10 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -22,6 +22,7 @@
#include <asm/svm.h>
#include <asm/sev-common.h>
+#include "cpuid.h"
#include "kvm_cache_regs.h"
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
@@ -33,6 +34,7 @@
#define MSRPM_OFFSETS 32
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
+extern int nrips;
extern int vgif;
extern bool intercept_smi;
extern bool x2avic_enabled;
@@ -260,16 +262,6 @@ struct vcpu_svm {
unsigned long soft_int_next_rip;
bool soft_int_injected;
- /* optional nested SVM features that are enabled for this guest */
- bool nrips_enabled : 1;
- bool tsc_scaling_enabled : 1;
- bool v_vmload_vmsave_enabled : 1;
- bool lbrv_enabled : 1;
- bool pause_filter_enabled : 1;
- bool pause_threshold_enabled : 1;
- bool vgif_enabled : 1;
- bool vnmi_enabled : 1;
-
u32 ldr_reg;
u32 dfr_reg;
struct page *avic_backing_page;
@@ -406,48 +398,6 @@ static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u3
return test_bit(bit, (unsigned long *)&control->intercepts);
}
-static inline void set_dr_intercepts(struct vcpu_svm *svm)
-{
- struct vmcb *vmcb = svm->vmcb01.ptr;
-
- if (!sev_es_guest(svm->vcpu.kvm)) {
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
- }
-
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
-
- recalc_intercepts(svm);
-}
-
-static inline void clr_dr_intercepts(struct vcpu_svm *svm)
-{
- struct vmcb *vmcb = svm->vmcb01.ptr;
-
- vmcb->control.intercepts[INTERCEPT_DR] = 0;
-
- /* DR7 access must remain intercepted for an SEV-ES guest */
- if (sev_es_guest(svm->vcpu.kvm)) {
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
- vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
- }
-
- recalc_intercepts(svm);
-}
-
static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -493,7 +443,8 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
{
- return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+ return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) &&
+ (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
}
static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
@@ -544,7 +495,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
{
- return svm->vnmi_enabled &&
+ return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) &&
(svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
}
@@ -660,7 +611,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
int nested_svm_exit_special(struct vcpu_svm *svm);
void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
-void __svm_write_tsc_multiplier(u64 multiplier);
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu);
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
struct vmcb_control_area *control);
void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
@@ -733,6 +684,7 @@ void __init sev_hardware_setup(void);
void sev_hardware_unsetup(void);
int sev_cpu_init(struct svm_cpu_data *sd);
void sev_init_vmcb(struct vcpu_svm *svm);
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
void sev_free_vcpu(struct kvm_vcpu *vcpu);
int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index d0abee35d7ba..41a4533f9989 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -252,7 +252,7 @@ static inline bool cpu_has_vmx_pml(void)
static inline bool cpu_has_vmx_xsaves(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_ENABLE_XSAVES;
}
static inline bool cpu_has_vmx_waitpkg(void)
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index 79450e1ed7cf..313b8bb5b8a7 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -78,7 +78,7 @@
SECONDARY_EXEC_DESC | \
SECONDARY_EXEC_ENABLE_RDTSCP | \
SECONDARY_EXEC_ENABLE_INVPCID | \
- SECONDARY_EXEC_XSAVES | \
+ SECONDARY_EXEC_ENABLE_XSAVES | \
SECONDARY_EXEC_RDSEED_EXITING | \
SECONDARY_EXEC_RDRAND_EXITING | \
SECONDARY_EXEC_TSC_SCALING | \
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 516391cc0d64..c5ec0ef51ff7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2307,7 +2307,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_ENABLE_RDTSCP |
- SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_ENABLE_XSAVES |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -6331,7 +6331,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
* If if it were, XSS would have to be checked against
* the XSS exit bitmap in vmcs12.
*/
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES);
case EXIT_REASON_UMWAIT:
case EXIT_REASON_TPAUSE:
return nested_cpu_has2(vmcs12,
@@ -6426,7 +6426,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
vmx = to_vmx(vcpu);
vmcs12 = get_vmcs12(vcpu);
- if (nested_vmx_allowed(vcpu) &&
+ if (guest_can_use(vcpu, X86_FEATURE_VMX) &&
(vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
@@ -6567,7 +6567,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
return -EINVAL;
} else {
- if (!nested_vmx_allowed(vcpu))
+ if (!guest_can_use(vcpu, X86_FEATURE_VMX))
return -EINVAL;
if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
@@ -6601,7 +6601,8 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
- (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
+ (!guest_can_use(vcpu, X86_FEATURE_VMX) ||
+ !vmx->nested.enlightened_vmcs_enabled))
return -EINVAL;
vmx_leave_nested(vcpu);
@@ -6874,7 +6875,7 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_ENABLE_VMFUNC |
SECONDARY_EXEC_RDSEED_EXITING |
- SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_ENABLE_XSAVES |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 96952263b029..b4b9d51438c6 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -168,7 +168,7 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
{
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES);
}
static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 80c769c58a87..f2efa0bf7ae8 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -22,23 +22,51 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
+enum intel_pmu_architectural_events {
+ /*
+ * The order of the architectural events matters as support for each
+ * event is enumerated via CPUID using the index of the event.
+ */
+ INTEL_ARCH_CPU_CYCLES,
+ INTEL_ARCH_INSTRUCTIONS_RETIRED,
+ INTEL_ARCH_REFERENCE_CYCLES,
+ INTEL_ARCH_LLC_REFERENCES,
+ INTEL_ARCH_LLC_MISSES,
+ INTEL_ARCH_BRANCHES_RETIRED,
+ INTEL_ARCH_BRANCHES_MISPREDICTED,
+
+ NR_REAL_INTEL_ARCH_EVENTS,
+
+ /*
+ * Pseudo-architectural event used to implement IA32_FIXED_CTR2, a.k.a.
+ * TSC reference cycles. The architectural reference cycles event may
+ * or may not actually use the TSC as the reference, e.g. might use the
+ * core crystal clock or the bus clock (yeah, "architectural").
+ */
+ PSEUDO_ARCH_REFERENCE_CYCLES = NR_REAL_INTEL_ARCH_EVENTS,
+ NR_INTEL_ARCH_EVENTS,
+};
+
static struct {
u8 eventsel;
u8 unit_mask;
} const intel_arch_events[] = {
- [0] = { 0x3c, 0x00 },
- [1] = { 0xc0, 0x00 },
- [2] = { 0x3c, 0x01 },
- [3] = { 0x2e, 0x4f },
- [4] = { 0x2e, 0x41 },
- [5] = { 0xc4, 0x00 },
- [6] = { 0xc5, 0x00 },
- /* The above index must match CPUID 0x0A.EBX bit vector */
- [7] = { 0x00, 0x03 },
+ [INTEL_ARCH_CPU_CYCLES] = { 0x3c, 0x00 },
+ [INTEL_ARCH_INSTRUCTIONS_RETIRED] = { 0xc0, 0x00 },
+ [INTEL_ARCH_REFERENCE_CYCLES] = { 0x3c, 0x01 },
+ [INTEL_ARCH_LLC_REFERENCES] = { 0x2e, 0x4f },
+ [INTEL_ARCH_LLC_MISSES] = { 0x2e, 0x41 },
+ [INTEL_ARCH_BRANCHES_RETIRED] = { 0xc4, 0x00 },
+ [INTEL_ARCH_BRANCHES_MISPREDICTED] = { 0xc5, 0x00 },
+ [PSEUDO_ARCH_REFERENCE_CYCLES] = { 0x00, 0x03 },
};
/* mapping between fixed pmc index and intel_arch_events array */
-static int fixed_pmc_events[] = {1, 0, 7};
+static int fixed_pmc_events[] = {
+ [0] = INTEL_ARCH_INSTRUCTIONS_RETIRED,
+ [1] = INTEL_ARCH_CPU_CYCLES,
+ [2] = PSEUDO_ARCH_REFERENCE_CYCLES,
+};
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
@@ -80,16 +108,18 @@ static bool intel_hw_event_available(struct kvm_pmc *pmc)
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
- for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
+ BUILD_BUG_ON(ARRAY_SIZE(intel_arch_events) != NR_INTEL_ARCH_EVENTS);
+
+ /*
+ * Disallow events reported as unavailable in guest CPUID. Note, this
+ * doesn't apply to pseudo-architectural events.
+ */
+ for (i = 0; i < NR_REAL_INTEL_ARCH_EVENTS; i++) {
if (intel_arch_events[i].eventsel != event_select ||
intel_arch_events[i].unit_mask != unit_mask)
continue;
- /* disable event that reported as not present by cpuid */
- if ((i < 7) && !(pmu->available_event_types & (1 << i)))
- return false;
-
- break;
+ return pmu->available_event_types & BIT(i);
}
return true;
@@ -438,16 +468,17 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
{
- size_t size = ARRAY_SIZE(fixed_pmc_events);
- struct kvm_pmc *pmc;
- u32 event;
int i;
+ BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_events) != KVM_PMC_MAX_FIXED);
+
for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- pmc = &pmu->fixed_counters[i];
- event = fixed_pmc_events[array_index_nospec(i, size)];
+ int index = array_index_nospec(i, KVM_PMC_MAX_FIXED);
+ struct kvm_pmc *pmc = &pmu->fixed_counters[index];
+ u32 event = fixed_pmc_events[index];
+
pmc->eventsel = (intel_arch_events[event].unit_mask << 8) |
- intel_arch_events[event].eventsel;
+ intel_arch_events[event].eventsel;
}
}
@@ -508,10 +539,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (pmu->version == 1) {
pmu->nr_arch_fixed_counters = 0;
} else {
- pmu->nr_arch_fixed_counters =
- min3(ARRAY_SIZE(fixed_pmc_events),
- (size_t) edx.split.num_counters_fixed,
- (size_t)kvm_pmu_cap.num_counters_fixed);
+ pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed,
+ kvm_pmu_cap.num_counters_fixed);
edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed,
kvm_pmu_cap.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b483a8baaacf..72e3943f3693 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -41,13 +41,12 @@
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
-#include <asm/kexec.h>
+#include <asm/reboot.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
-#include <asm/virtext.h>
#include <asm/vmx.h>
#include "capabilities.h"
@@ -237,9 +236,6 @@ static const struct {
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
-/* Control for disabling CPU Fill buffer clear */
-static bool __read_mostly vmx_fb_clear_ctrl_available;
-
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
@@ -255,14 +251,9 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
- u64 msr;
-
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
- if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
- return 0;
- }
+ if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
}
/* If set to auto use the default l1tf mitigation method */
@@ -366,22 +357,9 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
- return sprintf(s, "???\n");
+ return sysfs_emit(s, "???\n");
- return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
-}
-
-static void vmx_setup_fb_clear_ctrl(void)
-{
- u64 msr;
-
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
- !boot_cpu_has_bug(X86_BUG_MDS) &&
- !boot_cpu_has_bug(X86_BUG_TAA)) {
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
- if (msr & ARCH_CAP_FB_CLEAR_CTRL)
- vmx_fb_clear_ctrl_available = true;
- }
+ return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
@@ -409,7 +387,9 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
- vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+ vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA);
/*
* If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
@@ -754,17 +734,51 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
return ret;
}
-#ifdef CONFIG_KEXEC_CORE
-static void crash_vmclear_local_loaded_vmcss(void)
+/*
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
+ * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
+ */
+static int kvm_cpu_vmxoff(void)
+{
+ asm_volatile_goto("1: vmxoff\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "cc", "memory" : fault);
+
+ cr4_clear_bits(X86_CR4_VMXE);
+ return 0;
+
+fault:
+ cr4_clear_bits(X86_CR4_VMXE);
+ return -EIO;
+}
+
+static void vmx_emergency_disable(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
+ kvm_rebooting = true;
+
+ /*
+ * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
+ * set in task context. If this races with VMX is disabled by an NMI,
+ * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
+ * kvm_rebooting set.
+ */
+ if (!(__read_cr4() & X86_CR4_VMXE))
+ return;
+
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
vmcs_clear(v->vmcs);
+
+ kvm_cpu_vmxoff();
}
-#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
{
@@ -1899,25 +1913,14 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
return kvm_caps.default_tsc_scaling_ratio;
}
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
- vmcs_write64(TSC_OFFSET, offset);
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}
-static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
- vmcs_write64(TSC_MULTIPLIER, multiplier);
-}
-
-/*
- * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
- * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
- * all guests if the "nested" module option is off, and can also be disabled
- * for a single guest by disabling its VMX cpuid bit.
- */
-bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
-{
- return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}
/*
@@ -2047,7 +2050,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
break;
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
- if (!nested_vmx_allowed(vcpu))
+ if (!guest_can_use(vcpu, X86_FEATURE_VMX))
return 1;
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data))
@@ -2355,7 +2358,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
- if (!nested_vmx_allowed(vcpu))
+ if (!guest_can_use(vcpu, X86_FEATURE_VMX))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_RTIT_CTL:
@@ -2729,11 +2732,11 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return 0;
}
-static bool kvm_is_vmx_supported(void)
+static bool __kvm_is_vmx_supported(void)
{
- int cpu = raw_smp_processor_id();
+ int cpu = smp_processor_id();
- if (!cpu_has_vmx()) {
+ if (!(cpuid_ecx(1) & feature_bit(VMX))) {
pr_err("VMX not supported by CPU %d\n", cpu);
return false;
}
@@ -2747,13 +2750,24 @@ static bool kvm_is_vmx_supported(void)
return true;
}
+static bool kvm_is_vmx_supported(void)
+{
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_vmx_supported();
+ migrate_enable();
+
+ return supported;
+}
+
static int vmx_check_processor_compat(void)
{
int cpu = raw_smp_processor_id();
struct vmcs_config vmcs_conf;
struct vmx_capability vmx_cap;
- if (!kvm_is_vmx_supported())
+ if (!__kvm_is_vmx_supported())
return -EIO;
if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
@@ -2833,7 +2847,7 @@ static void vmx_hardware_disable(void)
{
vmclear_local_loaded_vmcss();
- if (cpu_vmxoff())
+ if (kvm_cpu_vmxoff())
kvm_spurious_fault();
hv_reset_evmcs();
@@ -3071,13 +3085,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmx->rmode.vm86_active = 1;
- /*
- * Very old userspace does not call KVM_SET_TSS_ADDR before entering
- * vcpu. Warn the user that an update is overdue.
- */
- if (!kvm_vmx->tss_addr)
- pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
-
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
@@ -3350,7 +3357,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmx->emulation_required = vmx_emulation_required(vcpu);
}
-static int vmx_get_max_tdp_level(void)
+static int vmx_get_max_ept_level(void)
{
if (cpu_has_vmx_ept_5levels())
return 5;
@@ -4553,16 +4560,19 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* based on a single guest CPUID bit, with a dedicated feature bit. This also
* verifies that the control is actually supported by KVM and hardware.
*/
-#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
-({ \
- bool __enabled; \
- \
- if (cpu_has_vmx_##name()) { \
- __enabled = guest_cpuid_has(&(vmx)->vcpu, \
- X86_FEATURE_##feat_name); \
- vmx_adjust_secondary_exec_control(vmx, exec_control, \
- SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
- } \
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({ \
+ struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \
+ bool __enabled; \
+ \
+ if (cpu_has_vmx_##name()) { \
+ if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \
+ __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \
+ else \
+ __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \
+ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
+ __enabled, exiting); \
+ } \
})
/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
@@ -4622,19 +4632,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- if (cpu_has_vmx_xsaves()) {
- /* Exposing XSAVES only when XSAVE is exposed */
- bool xsaves_enabled =
- boot_cpu_has(X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
-
- vcpu->arch.xsaves_enabled = xsaves_enabled;
-
- vmx_adjust_secondary_exec_control(vmx, &exec_control,
- SECONDARY_EXEC_XSAVES,
- xsaves_enabled, false);
- }
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
/*
* RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
@@ -4653,6 +4651,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_RDTSCP,
rdpid_or_rdtscp_enabled, false);
}
+
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
@@ -6796,8 +6795,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
read_unlock(&vcpu->kvm->mmu_lock);
- vmx_flush_tlb_current(vcpu);
-
+ /*
+ * No need for a manual TLB flush at this point, KVM has already done a
+ * flush if there were SPTEs pointing at the previous page.
+ */
out:
/*
* Do not pin apic access page in memory, the MMU notifier
@@ -7243,13 +7244,20 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
flags);
vcpu->arch.cr2 = native_read_cr2();
+ vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
+
+ vmx->idt_vectoring_info = 0;
vmx_enable_fb_clear(vmx);
- if (unlikely(vmx->fail))
+ if (unlikely(vmx->fail)) {
vmx->exit_reason.full = 0xdead;
- else
- vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ goto out;
+ }
+
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (likely(!vmx->exit_reason.failed_vmentry))
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
is_nmi(vmx_get_intr_info(vcpu))) {
@@ -7258,6 +7266,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
kvm_after_interrupt(vcpu);
}
+out:
guest_state_exit_irqoff();
}
@@ -7379,8 +7388,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
loadsegment(es, __USER_DS);
#endif
- vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
-
pt_guest_exit(vmx);
kvm_load_host_xsave_state(vcpu);
@@ -7397,17 +7404,12 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->nested.nested_run_pending = 0;
}
- vmx->idt_vectoring_info = 0;
-
if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
- if (likely(!vmx->exit_reason.failed_vmentry))
- vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(vmx->exit_reason.failed_vmentry))
@@ -7751,8 +7753,16 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
- vcpu->arch.xsaves_enabled = false;
+ /*
+ * XSAVES is effectively enabled if and only if XSAVE is also exposed
+ * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
+ * set if and only if XSAVE is supported.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
+
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
vmx_setup_uret_msrs(vmx);
@@ -7760,7 +7770,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmcs_set_secondary_exec_control(vmx,
vmx_secondary_exec_control(vmx));
- if (nested_vmx_allowed(vcpu))
+ if (guest_can_use(vcpu, X86_FEATURE_VMX))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
@@ -7769,7 +7779,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
- if (nested_vmx_allowed(vcpu))
+ if (guest_can_use(vcpu, X86_FEATURE_VMX))
nested_vmx_cr_fixed1_bits_update(vcpu);
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
@@ -8526,7 +8536,7 @@ static __init int hardware_setup(void)
*/
vmx_setup_me_spte_mask();
- kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+ kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
ept_caps_to_lpage_level(vmx_capability.ept));
/*
@@ -8622,10 +8632,8 @@ static void __vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
-#ifdef CONFIG_KEXEC_CORE
- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
- synchronize_rcu();
-#endif
+ cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
+
vmx_cleanup_l1d_flush();
}
@@ -8666,18 +8674,14 @@ static int __init vmx_init(void)
if (r)
goto err_l1d_flush;
- vmx_setup_fb_clear_ctrl();
-
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
pi_init_cpu(cpu);
}
-#ifdef CONFIG_KEXEC_CORE
- rcu_assign_pointer(crash_vmclear_loaded_vmcss,
- crash_vmclear_local_loaded_vmcss);
-#endif
+ cpu_emergency_register_virt_callback(vmx_emergency_disable);
+
vmx_check_vmcs12_offsets();
/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 32384ba38499..c2130d2c8e24 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -374,7 +374,6 @@ struct kvm_vmx {
u64 *pid_table;
};
-bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy);
int allocate_vpid(void);
@@ -562,7 +561,7 @@ static inline u8 vmx_get_rvi(void)
SECONDARY_EXEC_APIC_REGISTER_VIRT | \
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
SECONDARY_EXEC_SHADOW_VMCS | \
- SECONDARY_EXEC_XSAVES | \
+ SECONDARY_EXEC_ENABLE_XSAVES | \
SECONDARY_EXEC_RDSEED_EXITING | \
SECONDARY_EXEC_RDRAND_EXITING | \
SECONDARY_EXEC_ENABLE_PML | \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c381770bcbf1..9f18b06bbda6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -25,6 +25,7 @@
#include "tss.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+#include "mmu/page_track.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
@@ -237,6 +238,9 @@ EXPORT_SYMBOL_GPL(enable_apicv);
u64 __read_mostly host_xss;
EXPORT_SYMBOL_GPL(host_xss);
+u64 __read_mostly host_arch_capabilities;
+EXPORT_SYMBOL_GPL(host_arch_capabilities);
+
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@ -1021,7 +1025,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- if (vcpu->arch.xsaves_enabled &&
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
vcpu->arch.ia32_xss != host_xss)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
@@ -1052,7 +1056,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- if (vcpu->arch.xsaves_enabled &&
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
vcpu->arch.ia32_xss != host_xss)
wrmsrl(MSR_IA32_XSS, host_xss);
}
@@ -1620,12 +1624,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
static u64 kvm_get_arch_capabilities(void)
{
- u64 data = 0;
-
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
- data &= KVM_SUPPORTED_ARCH_CAP;
- }
+ u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
@@ -2631,7 +2630,7 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
else
vcpu->arch.tsc_offset = l1_offset;
- static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
+ static_call(kvm_x86_write_tsc_offset)(vcpu);
}
static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
@@ -2647,8 +2646,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli
vcpu->arch.tsc_scaling_ratio = l1_multiplier;
if (kvm_caps.has_tsc_control)
- static_call(kvm_x86_write_tsc_multiplier)(
- vcpu, vcpu->arch.tsc_scaling_ratio);
+ static_call(kvm_x86_write_tsc_multiplier)(vcpu);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -4665,7 +4663,6 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
return 0;
default:
return -ENXIO;
- break;
}
}
@@ -6532,7 +6529,7 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
struct kvm_msr_filter_range *user_range)
{
- unsigned long *bitmap = NULL;
+ unsigned long *bitmap;
size_t bitmap_size;
if (!user_range->nmsrs)
@@ -8245,11 +8242,6 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
}
-static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
-{
- return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
-}
-
static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
{
return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
@@ -8351,7 +8343,6 @@ static const struct x86_emulate_ops emulate_ops = {
.fix_hypercall = emulator_fix_hypercall,
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
- .guest_has_long_mode = emulator_guest_has_long_mode,
.guest_has_movbe = emulator_guest_has_movbe,
.guest_has_fxsr = emulator_guest_has_fxsr,
.guest_has_rdpid = emulator_guest_has_rdpid,
@@ -9172,7 +9163,7 @@ static int kvmclock_cpu_down_prep(unsigned int cpu)
static void tsc_khz_changed(void *data)
{
struct cpufreq_freqs *freq = data;
- unsigned long khz = 0;
+ unsigned long khz;
WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
@@ -9512,6 +9503,9 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_init_pmu_capability(ops->pmu_ops);
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
+
r = ops->hardware_setup();
if (r != 0)
goto out_mmu_exit;
@@ -11111,12 +11105,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
r = -EINTR;
goto out;
}
+
/*
- * It should be impossible for the hypervisor timer to be in
- * use before KVM has ever run the vCPU.
+ * Don't bother switching APIC timer emulation from the
+ * hypervisor timer to the software timer, the only way for the
+ * APIC timer to be active is if userspace stuffed vCPU state,
+ * i.e. put the vCPU into a nonsensical state. Only an INIT
+ * will transition the vCPU out of UNINITIALIZED (without more
+ * state stuffing from userspace), which will reset the local
+ * APIC and thus cancel the timer or drop the IRQ (if the timer
+ * already expired).
*/
- WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
-
kvm_vcpu_srcu_read_unlock(vcpu);
kvm_vcpu_block(vcpu);
kvm_vcpu_srcu_read_lock(vcpu);
@@ -11798,15 +11797,22 @@ static int sync_regs(struct kvm_vcpu *vcpu)
__set_regs(vcpu, &vcpu->run->s.regs.regs);
vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
}
+
if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
- if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
+ struct kvm_sregs sregs = vcpu->run->s.regs.sregs;
+
+ if (__set_sregs(vcpu, &sregs))
return -EINVAL;
+
vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
}
+
if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
- if (kvm_vcpu_ioctl_x86_set_vcpu_events(
- vcpu, &vcpu->run->s.regs.events))
+ struct kvm_vcpu_events events = vcpu->run->s.regs.events;
+
+ if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events))
return -EINVAL;
+
vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
}
@@ -12302,9 +12308,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out;
- ret = kvm_mmu_init_vm(kvm);
- if (ret)
- goto out_page_track;
+ kvm_mmu_init_vm(kvm);
ret = static_call(kvm_x86_vm_init)(kvm);
if (ret)
@@ -12349,7 +12353,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
out_uninit_mmu:
kvm_mmu_uninit_vm(kvm);
-out_page_track:
kvm_page_track_cleanup(kvm);
out:
return ret;
@@ -12627,6 +12630,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ /*
+ * KVM doesn't support moving memslots when there are external page
+ * trackers attached to the VM, i.e. if KVMGT is in use.
+ */
+ if (change == KVM_MR_MOVE && kvm_page_track_has_external_user(kvm))
+ return -EINVAL;
+
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
return -EINVAL;
@@ -12772,7 +12782,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
* See is_writable_pte() for more details (the case involving
* access-tracked SPTEs is particularly relevant).
*/
- kvm_arch_flush_remote_tlbs_memslot(kvm, new);
+ kvm_flush_remote_tlbs_memslot(kvm, new);
}
}
@@ -12781,6 +12791,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ if (change == KVM_MR_DELETE)
+ kvm_page_track_delete_slot(kvm, old);
+
if (!kvm->arch.n_requested_mmu_pages &&
(change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
unsigned long nr_mmu_pages;
@@ -12797,17 +12810,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_arch_free_memslot(kvm, old);
}
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
- kvm_mmu_zap_all(kvm);
-}
-
-void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot)
-{
- kvm_page_track_flush_slot(kvm, slot);
-}
-
static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 82e3dafc5453..1e7be1f6ab29 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -323,6 +323,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
extern u64 host_xcr0;
extern u64 host_xss;
+extern u64 host_arch_capabilities;
extern struct kvm_caps kvm_caps;
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 8f95fb267caa..76697df8dfd5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,7 +40,7 @@ SYM_TYPED_FUNC_START(__memcpy)
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)
-SYM_FUNC_ALIAS(memcpy, __memcpy)
+SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)
SYM_FUNC_START_LOCAL(memcpy_orig)
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0559b206fb11..ccdf3a597045 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -212,5 +212,5 @@ SYM_FUNC_START(__memmove)
SYM_FUNC_END(__memmove)
EXPORT_SYMBOL(__memmove)
-SYM_FUNC_ALIAS(memmove, __memmove)
+SYM_FUNC_ALIAS_MEMFUNC(memmove, __memmove)
EXPORT_SYMBOL(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 7c59a704c458..3d818b849ec6 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -40,7 +40,7 @@ SYM_FUNC_START(__memset)
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)
-SYM_FUNC_ALIAS(memset, __memset)
+SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
EXPORT_SYMBOL(memset)
SYM_FUNC_START_LOCAL(memset_orig)
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 1451e0c4ae22..235bbda6fc82 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -56,7 +56,6 @@ SYM_FUNC_END(__put_user_1)
EXPORT_SYMBOL(__put_user_1)
SYM_FUNC_START(__put_user_nocheck_1)
- ENDBR
ASM_STAC
2: movb %al,(%_ASM_CX)
xor %ecx,%ecx
@@ -76,7 +75,6 @@ SYM_FUNC_END(__put_user_2)
EXPORT_SYMBOL(__put_user_2)
SYM_FUNC_START(__put_user_nocheck_2)
- ENDBR
ASM_STAC
4: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
@@ -96,7 +94,6 @@ SYM_FUNC_END(__put_user_4)
EXPORT_SYMBOL(__put_user_4)
SYM_FUNC_START(__put_user_nocheck_4)
- ENDBR
ASM_STAC
6: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
@@ -119,7 +116,6 @@ SYM_FUNC_END(__put_user_8)
EXPORT_SYMBOL(__put_user_8)
SYM_FUNC_START(__put_user_nocheck_8)
- ENDBR
ASM_STAC
9: mov %_ASM_AX,(%_ASM_CX)
#ifdef CONFIG_X86_32
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index e06a199423c0..b2cc7b4552a1 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -140,3 +140,15 @@ void __init efi_runtime_update_mappings(void)
}
}
}
+
+void arch_efi_call_virt_setup(void)
+{
+ efi_fpu_begin();
+ firmware_restrict_branch_speculation_start();
+}
+
+void arch_efi_call_virt_teardown(void)
+{
+ firmware_restrict_branch_speculation_end();
+ efi_fpu_end();
+}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 77f7ac3668cb..91d31ac422d6 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -474,19 +474,34 @@ void __init efi_dump_pagetable(void)
* can not change under us.
* It should be ensured that there are no concurrent calls to this function.
*/
-void efi_enter_mm(void)
+static void efi_enter_mm(void)
{
efi_prev_mm = current->active_mm;
current->active_mm = &efi_mm;
switch_mm(efi_prev_mm, &efi_mm, NULL);
}
-void efi_leave_mm(void)
+static void efi_leave_mm(void)
{
current->active_mm = efi_prev_mm;
switch_mm(&efi_mm, efi_prev_mm, NULL);
}
+void arch_efi_call_virt_setup(void)
+{
+ efi_sync_low_kernel_mappings();
+ efi_fpu_begin();
+ firmware_restrict_branch_speculation_start();
+ efi_enter_mm();
+}
+
+void arch_efi_call_virt_teardown(void)
+{
+ efi_leave_mm();
+ firmware_restrict_branch_speculation_end();
+ efi_fpu_end();
+}
+
static DEFINE_SPINLOCK(efi_runtime_lock);
/*
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index c2a29be35c01..08aa0f25f12a 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -19,6 +19,10 @@ CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY
# optimization flags.
KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%,$(KBUILD_CFLAGS))
+# When LTO is enabled, llvm emits many text sections, which is not supported
+# by kexec. Remove -flto=* flags.
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS))
+
# When linking purgatory.ro with -r unresolved symbols are not checked,
# also link a purgatory.chk binary without -r to check for unresolved symbols.
PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index ee89f6bb9242..8bc72a51b257 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -13,15 +13,16 @@ obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \
ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \
stub_$(BITS).o stub_segv.o \
sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \
- mem_$(BITS).o subarch.o os-$(OS)/
+ mem_$(BITS).o subarch.o os-Linux/
ifeq ($(CONFIG_X86_32),y)
-obj-y += checksum_32.o syscalls_32.o
+obj-y += syscalls_32.o
obj-$(CONFIG_ELF_CORE) += elfcore.o
subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o
+subarch-y += ../lib/checksum_32.o
subarch-y += ../kernel/sys_ia32.o
else
diff --git a/arch/x86/um/asm/mm_context.h b/arch/x86/um/asm/mm_context.h
index 4a73d63e4760..dc32dc023c2f 100644
--- a/arch/x86/um/asm/mm_context.h
+++ b/arch/x86/um/asm/mm_context.h
@@ -11,8 +11,6 @@
#include <linux/mutex.h>
#include <asm/ldt.h>
-extern void ldt_host_info(void);
-
#define LDT_PAGES_MAX \
((LDT_ENTRIES * LDT_ENTRY_SIZE)/PAGE_SIZE)
#define LDT_ENTRIES_PER_PAGE \
diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S
deleted file mode 100644
index aed782ab7721..000000000000
--- a/arch/x86/um/checksum_32.S
+++ /dev/null
@@ -1,214 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * IP/TCP/UDP checksumming routines
- *
- * Authors: Jorge Cwik, <jorge@laser.satlink.net>
- * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
- * Tom May, <ftom@netcom.com>
- * Pentium Pro/II routines:
- * Alexander Kjeldaas <astor@guardian.no>
- * Finn Arne Gangstad <finnag@guardian.no>
- * Lots of code moved from tcp.c and ip.c; see those files
- * for more names.
- *
- * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
- * handling.
- * Andi Kleen, add zeroing on error
- * converted to pure assembler
- */
-
-#include <asm/errno.h>
-#include <asm/asm.h>
-#include <asm/export.h>
-
-/*
- * computes a partial checksum, e.g. for TCP/UDP fragments
- */
-
-/*
-unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
- */
-
-.text
-.align 4
-.globl csum_partial
-
-#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
-
- /*
- * Experiments with Ethernet and SLIP connections show that buff
- * is aligned on either a 2-byte or 4-byte boundary. We get at
- * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
- * Fortunately, it is easy to convert 2-byte alignment to 4-byte
- * alignment for the unrolled loop.
- */
-csum_partial:
- pushl %esi
- pushl %ebx
- movl 20(%esp),%eax # Function arg: unsigned int sum
- movl 16(%esp),%ecx # Function arg: int len
- movl 12(%esp),%esi # Function arg: unsigned char *buff
- testl $2, %esi # Check alignment.
- jz 2f # Jump if alignment is ok.
- subl $2, %ecx # Alignment uses up two bytes.
- jae 1f # Jump if we had at least two bytes.
- addl $2, %ecx # ecx was < 2. Deal with it.
- jmp 4f
-1: movw (%esi), %bx
- addl $2, %esi
- addw %bx, %ax
- adcl $0, %eax
-2:
- movl %ecx, %edx
- shrl $5, %ecx
- jz 2f
- testl %esi, %esi
-1: movl (%esi), %ebx
- adcl %ebx, %eax
- movl 4(%esi), %ebx
- adcl %ebx, %eax
- movl 8(%esi), %ebx
- adcl %ebx, %eax
- movl 12(%esi), %ebx
- adcl %ebx, %eax
- movl 16(%esi), %ebx
- adcl %ebx, %eax
- movl 20(%esi), %ebx
- adcl %ebx, %eax
- movl 24(%esi), %ebx
- adcl %ebx, %eax
- movl 28(%esi), %ebx
- adcl %ebx, %eax
- lea 32(%esi), %esi
- dec %ecx
- jne 1b
- adcl $0, %eax
-2: movl %edx, %ecx
- andl $0x1c, %edx
- je 4f
- shrl $2, %edx # This clears CF
-3: adcl (%esi), %eax
- lea 4(%esi), %esi
- dec %edx
- jne 3b
- adcl $0, %eax
-4: andl $3, %ecx
- jz 7f
- cmpl $2, %ecx
- jb 5f
- movw (%esi),%cx
- leal 2(%esi),%esi
- je 6f
- shll $16,%ecx
-5: movb (%esi),%cl
-6: addl %ecx,%eax
- adcl $0, %eax
-7:
- popl %ebx
- popl %esi
- RET
-
-#else
-
-/* Version for PentiumII/PPro */
-
-csum_partial:
- pushl %esi
- pushl %ebx
- movl 20(%esp),%eax # Function arg: unsigned int sum
- movl 16(%esp),%ecx # Function arg: int len
- movl 12(%esp),%esi # Function arg: const unsigned char *buf
-
- testl $2, %esi
- jnz 30f
-10:
- movl %ecx, %edx
- movl %ecx, %ebx
- andl $0x7c, %ebx
- shrl $7, %ecx
- addl %ebx,%esi
- shrl $2, %ebx
- negl %ebx
- lea 45f(%ebx,%ebx,2), %ebx
- testl %esi, %esi
- jmp *%ebx
-
- # Handle 2-byte-aligned regions
-20: addw (%esi), %ax
- lea 2(%esi), %esi
- adcl $0, %eax
- jmp 10b
-
-30: subl $2, %ecx
- ja 20b
- je 32f
- movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
- addl %ebx, %eax
- adcl $0, %eax
- jmp 80f
-32:
- addw (%esi), %ax # csumming 2 bytes, 2-aligned
- adcl $0, %eax
- jmp 80f
-
-40:
- addl -128(%esi), %eax
- adcl -124(%esi), %eax
- adcl -120(%esi), %eax
- adcl -116(%esi), %eax
- adcl -112(%esi), %eax
- adcl -108(%esi), %eax
- adcl -104(%esi), %eax
- adcl -100(%esi), %eax
- adcl -96(%esi), %eax
- adcl -92(%esi), %eax
- adcl -88(%esi), %eax
- adcl -84(%esi), %eax
- adcl -80(%esi), %eax
- adcl -76(%esi), %eax
- adcl -72(%esi), %eax
- adcl -68(%esi), %eax
- adcl -64(%esi), %eax
- adcl -60(%esi), %eax
- adcl -56(%esi), %eax
- adcl -52(%esi), %eax
- adcl -48(%esi), %eax
- adcl -44(%esi), %eax
- adcl -40(%esi), %eax
- adcl -36(%esi), %eax
- adcl -32(%esi), %eax
- adcl -28(%esi), %eax
- adcl -24(%esi), %eax
- adcl -20(%esi), %eax
- adcl -16(%esi), %eax
- adcl -12(%esi), %eax
- adcl -8(%esi), %eax
- adcl -4(%esi), %eax
-45:
- lea 128(%esi), %esi
- adcl $0, %eax
- dec %ecx
- jge 40b
- movl %edx, %ecx
-50: andl $3, %ecx
- jz 80f
-
- # Handle the last 1-3 bytes without jumping
- notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
- movl $0xffffff,%ebx # by the shll and shrl instructions
- shll $3,%ecx
- shrl %cl,%ebx
- andl -128(%esi),%ebx # esi is 4-aligned so should be ok
- addl %ebx,%eax
- adcl $0,%eax
-80:
- popl %ebx
- popl %esi
- RET
-
-#endif
- EXPORT_SYMBOL(csum_partial)
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index 863d0d6b3edc..7250d0e0e1a9 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -138,7 +138,7 @@ void __init xen_efi_init(struct boot_params *boot_params)
if (efi_systab_xen == NULL)
return;
- strncpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen",
+ strscpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen",
sizeof(boot_params->efi_info.efi_loader_signature));
boot_params->efi_info.efi_systab = (__u32)__pa(efi_systab_xen);
boot_params->efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b8db2148c07d..0337392a3121 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -32,7 +32,7 @@ EXPORT_SYMBOL_GPL(hypercall_page);
* &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
* and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
* but during boot it is switched to point to xen_vcpu_info.
- * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events.
+ * The pointer is used in xen_evtchn_do_upcall to acknowledge pending events.
*/
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 9a192f51f1b0..3f8c34707c50 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -136,7 +136,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback)
inc_irq_stat(irq_hv_callback_count);
- xen_hvm_evtchn_do_upcall();
+ xen_evtchn_do_upcall();
set_irq_regs(old_regs);
}
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 49352fad7d1d..bbbfdd495ebd 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -101,6 +101,17 @@ struct tls_descs {
struct desc_struct desc[3];
};
+DEFINE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode) = XEN_LAZY_NONE;
+DEFINE_PER_CPU(unsigned int, xen_lazy_nesting);
+
+enum xen_lazy_mode xen_get_lazy_mode(void)
+{
+ if (in_interrupt())
+ return XEN_LAZY_NONE;
+
+ return this_cpu_read(xen_lazy_mode);
+}
+
/*
* Updating the 3 TLS descriptors in the GDT on every task switch is
* surprisingly expensive so we avoid updating them if they haven't
@@ -362,10 +373,25 @@ static noinstr unsigned long xen_get_debugreg(int reg)
return HYPERVISOR_get_debugreg(reg);
}
+static void xen_start_context_switch(struct task_struct *prev)
+{
+ BUG_ON(preemptible());
+
+ if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+ }
+ enter_lazy(XEN_LAZY_CPU);
+}
+
static void xen_end_context_switch(struct task_struct *next)
{
+ BUG_ON(preemptible());
+
xen_mc_flush();
- paravirt_end_context_switch(next);
+ leave_lazy(XEN_LAZY_CPU);
+ if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+ arch_enter_lazy_mmu_mode();
}
static unsigned long xen_store_tr(void)
@@ -472,7 +498,7 @@ static void xen_set_ldt(const void *addr, unsigned entries)
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
}
static void xen_load_gdt(const struct desc_ptr *dtr)
@@ -568,7 +594,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
* exception between the new %fs descriptor being loaded and
* %fs being effectively cleared at __switch_to().
*/
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ if (xen_get_lazy_mode() == XEN_LAZY_CPU)
loadsegment(fs, 0);
xen_mc_batch();
@@ -577,7 +603,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
load_TLS_descriptor(t, cpu, 1);
load_TLS_descriptor(t, cpu, 2);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
}
static void xen_load_gs_index(unsigned int idx)
@@ -909,7 +935,7 @@ static void xen_load_sp0(unsigned long sp0)
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
@@ -973,7 +999,7 @@ static void xen_write_cr0(unsigned long cr0)
MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
}
static void xen_write_cr4(unsigned long cr4)
@@ -1156,7 +1182,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
#endif
.io_delay = xen_io_delay,
- .start_context_switch = paravirt_start_context_switch,
+ .start_context_switch = xen_start_context_switch,
.end_context_switch = xen_end_context_switch,
},
};
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 1652c39e3dfb..b6830554ff69 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -236,7 +236,7 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
u.val = pmd_val_ma(val);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -270,7 +270,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
struct mmu_update u;
- if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
+ if (xen_get_lazy_mode() != XEN_LAZY_MMU)
return false;
xen_mc_batch();
@@ -279,7 +279,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
u.val = pte_val_ma(pteval);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
return true;
}
@@ -325,7 +325,7 @@ void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
u.val = pte_val_ma(pte);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
/* Assume pteval_t is equivalent to all the other *val_t types. */
@@ -419,7 +419,7 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
u.val = pud_val_ma(val);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -499,7 +499,7 @@ static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
__xen_set_p4d_hyper(ptr, val);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -531,7 +531,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)
if (user_ptr)
__xen_set_p4d_hyper((p4d_t *)user_ptr, val);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
#if CONFIG_PGTABLE_LEVELS >= 5
@@ -1245,7 +1245,7 @@ static noinline void xen_flush_tlb(void)
op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -1265,7 +1265,7 @@ static void xen_flush_tlb_one_user(unsigned long addr)
op->arg1.linear_addr = addr & PAGE_MASK;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -1302,7 +1302,7 @@ static void xen_flush_tlb_multi(const struct cpumask *cpus,
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
static unsigned long xen_read_cr3(void)
@@ -1361,7 +1361,7 @@ static void xen_write_cr3(unsigned long cr3)
else
__xen_write_cr3(false, 0);
- xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+ xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */
}
/*
@@ -1396,7 +1396,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
__xen_write_cr3(true, cr3);
- xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+ xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */
}
static int xen_pgd_alloc(struct mm_struct *mm)
@@ -1557,7 +1557,7 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned)
__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
}
@@ -1587,7 +1587,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
__set_pfn_prot(pfn, PAGE_KERNEL);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
ClearPagePinned(page);
}
@@ -1804,7 +1804,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
*/
xen_mc_batch();
__xen_write_cr3(true, __pa(init_top_pgt));
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
/* We can't that easily rip out L3 and L2, as the Xen pagetables are
* set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
@@ -2083,6 +2083,23 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
+static void xen_enter_lazy_mmu(void)
+{
+ enter_lazy(XEN_LAZY_MMU);
+}
+
+static void xen_flush_lazy_mmu(void)
+{
+ preempt_disable();
+
+ if (xen_get_lazy_mode() == XEN_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ arch_enter_lazy_mmu_mode();
+ }
+
+ preempt_enable();
+}
+
static void __init xen_post_allocator_init(void)
{
pv_ops.mmu.set_pte = xen_set_pte;
@@ -2107,7 +2124,7 @@ static void xen_leave_lazy_mmu(void)
{
preempt_disable();
xen_mc_flush();
- paravirt_leave_lazy_mmu();
+ leave_lazy(XEN_LAZY_MMU);
preempt_enable();
}
@@ -2166,9 +2183,9 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = {
.exit_mmap = xen_exit_mmap,
.lazy_mode = {
- .enter = paravirt_enter_lazy_mmu,
+ .enter = xen_enter_lazy_mmu,
.leave = xen_leave_lazy_mmu,
- .flush = paravirt_flush_lazy_mmu,
+ .flush = xen_flush_lazy_mmu,
},
.set_fixmap = xen_set_fixmap,
@@ -2385,7 +2402,7 @@ static noinline void xen_flush_tlb_all(void)
op->cmd = MMUEXT_TLB_FLUSH_ALL;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 1c51b2c87f30..c3867b585e0d 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -26,7 +26,7 @@ static inline void xen_mc_batch(void)
/* need to disable interrupts until this entry is complete */
local_irq_save(flags);
- trace_xen_mc_batch(paravirt_get_lazy_mode());
+ trace_xen_mc_batch(xen_get_lazy_mode());
__this_cpu_write(xen_mc_irq_flags, flags);
}
@@ -44,7 +44,7 @@ static inline void xen_mc_issue(unsigned mode)
{
trace_xen_mc_issue(mode);
- if ((paravirt_get_lazy_mode() & mode) == 0)
+ if ((xen_get_lazy_mode() & mode) == 0)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */