diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 694 |
1 files changed, 476 insertions, 218 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fafd81d2c9ea..0ed07d8d2caa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -51,6 +51,7 @@ #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> #include <linux/sched/stat.h> +#include <linux/sched/isolation.h> #include <linux/mem_encrypt.h> #include <trace/events/kvm.h> @@ -67,6 +68,7 @@ #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/intel_pt.h> +#include <clocksource/hyperv_timer.h> #define CREATE_TRACE_POINTS #include "trace.h" @@ -152,6 +154,9 @@ EXPORT_SYMBOL_GPL(enable_vmware_backdoor); static bool __read_mostly force_emulation_prefix = false; module_param(force_emulation_prefix, bool, S_IRUGO); +int __read_mostly pi_inject_timer = -1; +module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -355,7 +360,8 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base); asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ - BUG(); + if (!kvm_rebooting) + BUG(); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); @@ -669,8 +675,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, data, offset, len, access); } +static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) +{ + return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | + rsvd_bits(1, 2); +} + /* - * Load the pae pdptrs. Return true is they are all valid. + * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. */ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { @@ -689,8 +701,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && - (pdpte[i] & - vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) { + (pdpte[i] & pdptr_rsvd_bits(vcpu))) { ret = 0; goto out; } @@ -716,7 +727,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) gfn_t gfn; int r; - if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) + if (!is_pae_paging(vcpu)) return false; if (!test_bit(VCPU_EXREG_PDPTR, @@ -959,8 +970,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_long_mode(vcpu) && (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))) return 1; - else if (is_pae(vcpu) && is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + else if (is_pae_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); @@ -1135,6 +1146,44 @@ static u32 msrs_to_save[] = { MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_IA32_UMWAIT_CONTROL, + + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, + MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, + MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, + MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, + MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, + MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, + MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, + MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, + MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, + MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, + MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, + MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, + MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, + MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, }; static unsigned num_msrs_to_save; @@ -1173,7 +1222,28 @@ static u32 emulated_msrs[] = { MSR_AMD64_VIRT_SPEC_CTRL, MSR_IA32_POWER_CTL, + /* + * The following list leaves out MSRs whose values are determined + * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. + * We always support the "true" VMX control MSRs, even if the host + * processor does not, so I am putting these registers here rather + * than in msrs_to_save. + */ + MSR_IA32_VMX_BASIC, + MSR_IA32_VMX_TRUE_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_EXIT_CTLS, + MSR_IA32_VMX_TRUE_ENTRY_CTLS, + MSR_IA32_VMX_MISC, + MSR_IA32_VMX_CR0_FIXED0, + MSR_IA32_VMX_CR4_FIXED0, + MSR_IA32_VMX_VMCS_ENUM, + MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_EPT_VPID_CAP, + MSR_IA32_VMX_VMFUNC, + MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, }; static unsigned num_emulated_msrs; @@ -1209,11 +1279,12 @@ static u32 msr_based_features[] = { static unsigned int num_msr_based_features; -u64 kvm_get_arch_capabilities(void) +static u64 kvm_get_arch_capabilities(void) { - u64 data; + u64 data = 0; - rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); /* * If we're doing cache flushes (either "always" or "cond") @@ -1227,9 +1298,15 @@ u64 kvm_get_arch_capabilities(void) if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + data |= ARCH_CAP_RDCL_NO; + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + data |= ARCH_CAP_SSB_NO; + if (!boot_cpu_has_bug(X86_BUG_MDS)) + data |= ARCH_CAP_MDS_NO; + return data; } -EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { @@ -1325,19 +1402,23 @@ void kvm_enable_efer_bits(u64 mask) EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); /* - * Writes msr value into into the appropriate "register". + * Write @data into the MSR specified by @index. Select MSR specific fault + * checks are bypassed if @host_initiated is %true. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, + bool host_initiated) { - switch (msr->index) { + struct msr_data msr; + + switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: case MSR_KERNEL_GS_BASE: case MSR_CSTAR: case MSR_LSTAR: - if (is_noncanonical_address(msr->data, vcpu)) + if (is_noncanonical_address(data, vcpu)) return 1; break; case MSR_IA32_SYSENTER_EIP: @@ -1354,38 +1435,95 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. */ - msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu)); + data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); } - return kvm_x86_ops->set_msr(vcpu, msr); + + msr.data = data; + msr.index = index; + msr.host_initiated = host_initiated; + + return kvm_x86_ops->set_msr(vcpu, &msr); } -EXPORT_SYMBOL_GPL(kvm_set_msr); /* - * Adapt set_msr() to msr_io()'s calling convention + * Read the MSR specified by @index into @data. Select MSR specific fault + * checks are bypassed if @host_initiated is %true. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. */ -static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; - int r; + int ret; msr.index = index; - msr.host_initiated = true; - r = kvm_get_msr(vcpu, &msr); - if (r) - return r; + msr.host_initiated = host_initiated; - *data = msr.data; - return 0; + ret = kvm_x86_ops->get_msr(vcpu, &msr); + if (!ret) + *data = msr.data; + return ret; } -static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - struct msr_data msr; + return __kvm_get_msr(vcpu, index, data, false); +} +EXPORT_SYMBOL_GPL(kvm_get_msr); - msr.data = *data; - msr.index = index; - msr.host_initiated = true; - return kvm_set_msr(vcpu, &msr); +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + return __kvm_set_msr(vcpu, index, data, false); +} +EXPORT_SYMBOL_GPL(kvm_set_msr); + +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = kvm_rcx_read(vcpu); + u64 data; + + if (kvm_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_read(ecx, data); + + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); + +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = kvm_rcx_read(vcpu); + u64 data = kvm_read_edx_eax(vcpu); + + if (kvm_set_msr(vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_write(ecx, data); + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return __kvm_get_msr(vcpu, index, data, true); +} + +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return __kvm_set_msr(vcpu, index, *data, true); } #ifdef CONFIG_X86_64 @@ -1434,12 +1572,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) void kvm_set_pending_timer(struct kvm_vcpu *vcpu) { - /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in - * vcpu_enter_guest. This function is only called from - * the physical CPU that is running vcpu. - */ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); } static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) @@ -1518,9 +1652,6 @@ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, *pshift = shift; *pmultiplier = div_frac(scaled64, tps32); - - pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n", - __func__, base_hz, scaled_hz, shift, *pmultiplier); } #ifdef CONFIG_X86_64 @@ -1728,7 +1859,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = ktime_get_boot_ns(); + ns = ktime_get_boottime_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { @@ -1763,12 +1894,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!kvm_check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; - pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; offset = kvm_compute_tsc_offset(vcpu, data); - pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); @@ -1787,8 +1916,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; matched = false; - pr_debug("kvm: new tsc generation %llu, clock %llu\n", - kvm->arch.cur_tsc_generation, data); } /* @@ -2070,7 +2197,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { spin_unlock(&ka->pvclock_gtod_sync_lock); - return ktime_get_boot_ns() + ka->kvmclock_offset; + return ktime_get_boottime_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; @@ -2086,7 +2213,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) &hv_clock.tsc_to_system_mul); ret = __pvclock_read_cycles(&hv_clock, rdtsc()); } else - ret = ktime_get_boot_ns() + ka->kvmclock_offset; + ret = ktime_get_boottime_ns() + ka->kvmclock_offset; put_cpu(); @@ -2185,7 +2312,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = ktime_get_boot_ns(); + kernel_ns = ktime_get_boottime_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -2437,6 +2564,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb(vcpu, false); @@ -2544,13 +2673,24 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_MISC_ENABLE: - vcpu->arch.ia32_misc_enable_msr = data; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && + ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) + return 1; + vcpu->arch.ia32_misc_enable_msr = data; + kvm_update_cpuid(vcpu); + } else { + vcpu->arch.ia32_misc_enable_msr = data; + } break; case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; vcpu->arch.smbase = data; break; + case MSR_IA32_POWER_CTL: + vcpu->arch.msr_ia32_power_ctl = data; + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; @@ -2625,6 +2765,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; break; + case MSR_KVM_POLL_CONTROL: + /* only enable bit supported */ + if (data & (-1ULL << 1)) + return 1; + + vcpu->arch.msr_kvm_poll_control = data; + break; + case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: @@ -2714,18 +2862,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_msr_common); - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) -{ - return kvm_x86_ops->get_msr(vcpu, msr); -} -EXPORT_SYMBOL_GPL(kvm_get_msr); - static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data; @@ -2802,6 +2938,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.arch_capabilities; break; + case MSR_IA32_POWER_CTL: + msr_info->data = vcpu->arch.msr_ia32_power_ctl; + break; case MSR_IA32_TSC: msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; break; @@ -2874,6 +3013,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_PV_EOI_EN: msr_info->data = vcpu->arch.pv_eoi.msr_val; break; + case MSR_KVM_POLL_CONTROL: + msr_info->data = vcpu->arch.msr_kvm_poll_control; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -3066,7 +3208,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_EVENTFD: case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: - case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: case KVM_CAP_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: @@ -3083,6 +3224,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_PMU_EVENT_FILTER: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: @@ -3095,7 +3237,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_CLOCK_TSC_STABLE; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE; + r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_CSTATE; if(kvm_can_mwait_in_guest()) r |= KVM_X86_DISABLE_EXITS_MWAIT; break; @@ -3141,6 +3284,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvm_x86_ops->get_nested_state ? kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; break; + case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: + r = kvm_x86_ops->enable_direct_tlbflush != NULL; + break; + case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: + r = kvm_x86_ops->nested_enable_evmcs != NULL; + break; default: break; } @@ -3264,6 +3413,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); @@ -3460,8 +3613,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - if (kvm_x86_ops->setup_mce) - kvm_x86_ops->setup_mce(vcpu); + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -3911,6 +4063,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = -EFAULT; } return r; + case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: + if (!kvm_x86_ops->enable_direct_tlbflush) + return -ENOTTY; + + return kvm_x86_ops->enable_direct_tlbflush(vcpu); default: return -EINVAL; @@ -4612,6 +4769,8 @@ split_irqchip_unlock: kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: @@ -4926,6 +5085,9 @@ set_identity_unlock: r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd); break; } + case KVM_SET_PMU_EVENT_FILTER: + r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); + break; default: r = -ENOTTY; } @@ -4938,6 +5100,11 @@ static void kvm_init_msr_list(void) u32 dummy[2]; unsigned i, j; + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, + "Please update the fixed PMCs in msrs_to_save[]"); + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, + "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; @@ -5261,6 +5428,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, /* kvm_write_guest_virt_system can pull in tons of pages. */ vcpu->arch.l1tf_flush_l1d = true; + /* + * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED + * is returned, but our callers are not ready for that and they blindly + * call kvm_inject_page_fault. Ensure that they at least do not leak + * uninitialized kernel stack memory into cr2 and error code. + */ + memset(exception, 0, sizeof(*exception)); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } @@ -5269,7 +5443,6 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); int handle_ud(struct kvm_vcpu *vcpu) { int emul_type = EMULTYPE_TRAP_UD; - enum emulation_result er; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; @@ -5278,15 +5451,10 @@ int handle_ud(struct kvm_vcpu *vcpu) sig, sizeof(sig), &e) == 0 && memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) { kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); - emul_type = 0; + emul_type = EMULTYPE_TRAP_UD_FORCED; } - er = kvm_emulate_instruction(vcpu, emul_type); - if (er == EMULATE_USER_EXIT) - return 0; - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + return kvm_emulate_instruction(vcpu, emul_type); } EXPORT_SYMBOL_GPL(handle_ud); @@ -5319,7 +5487,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, 0, access)) { + vcpu->arch.mmio_access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -5913,28 +6081,13 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - struct msr_data msr; - int r; - - msr.index = msr_index; - msr.host_initiated = false; - r = kvm_get_msr(emul_to_vcpu(ctxt), &msr); - if (r) - return r; - - *pdata = msr.data; - return 0; + return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - struct msr_data msr; - - msr.data = data; - msr.index = msr_index; - msr.host_initiated = false; - return kvm_set_msr(emul_to_vcpu(ctxt), &msr); + return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); } static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) @@ -6017,6 +6170,11 @@ static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) kvm_smm_changed(emul_to_vcpu(ctxt)); } +static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) +{ + return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -6058,6 +6216,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_hflags = emulator_set_hflags, .pre_leave_smm = emulator_pre_leave_smm, .post_leave_smm = emulator_post_leave_smm, + .set_xcr = emulator_set_xcr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6117,7 +6276,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) +void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; @@ -6129,37 +6288,43 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) ctxt->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); - if (ret != X86EMUL_CONTINUE) - return EMULATE_FAIL; - - ctxt->eip = ctxt->_eip; - kvm_rip_write(vcpu, ctxt->eip); - kvm_set_rflags(vcpu, ctxt->eflags); - - return EMULATE_DONE; + if (ret != X86EMUL_CONTINUE) { + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } else { + ctxt->eip = ctxt->_eip; + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); + } } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { - int r = EMULATE_DONE; - ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); - if (emulation_type & EMULTYPE_NO_UD_ON_FAIL) - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_VMWARE_GP) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } - if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { + if (emulation_type & EMULTYPE_SKIP) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - r = EMULATE_USER_EXIT; + return 0; } kvm_queue_exception(vcpu, UD_VECTOR); - return r; + if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + return 1; } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, @@ -6314,7 +6479,7 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) +static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; @@ -6323,18 +6488,20 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); + return 0; } + kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); + return 1; } int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); - int r = EMULATE_DONE; + int r; - kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_x86_ops->skip_emulated_instruction(vcpu); + if (unlikely(!r)) + return 0; /* * rflags is the old, "raw" value of the flags. The new value has @@ -6345,8 +6512,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) * that sets the TF flag". */ if (unlikely(rflags & X86_EFLAGS_TF)) - kvm_vcpu_do_singlestep(vcpu, &r); - return r == EMULATE_DONE; + r = kvm_vcpu_do_singlestep(vcpu); + return r; } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); @@ -6365,7 +6532,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; + *r = 0; return true; } } @@ -6378,10 +6545,10 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.db); if (dr6 != 0) { - vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); - *r = EMULATE_DONE; + *r = 1; return true; } } @@ -6465,32 +6632,48 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; if (r != EMULATION_OK) { - if (emulation_type & EMULTYPE_TRAP_UD) - return EMULATE_FAIL; + if ((emulation_type & EMULTYPE_TRAP_UD) || + (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) - return EMULATE_DONE; - if (ctxt->have_exception && inject_emulated_exception(vcpu)) - return EMULATE_DONE; - if (emulation_type & EMULTYPE_SKIP) - return EMULATE_FAIL; + return 1; + if (ctxt->have_exception) { + /* + * #UD should result in just EMULATION_FAILED, and trap-like + * exception should not be encountered during decode. + */ + WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || + exception_type(ctxt->exception.vector) == EXCPT_TRAP); + inject_emulated_exception(vcpu); + return 1; + } return handle_emulation_failure(vcpu, emulation_type); } } - if ((emulation_type & EMULTYPE_VMWARE) && - !is_vmware_backdoor_opcode(ctxt)) - return EMULATE_FAIL; + if ((emulation_type & EMULTYPE_VMWARE_GP) && + !is_vmware_backdoor_opcode(ctxt)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + /* + * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks + * for kvm_skip_emulated_instruction(). The caller is responsible for + * updating interruptibility state and injecting single-step #DBs. + */ if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); - return EMULATE_DONE; + return 1; } if (retry_instruction(ctxt, cr2, emulation_type)) - return EMULATE_DONE; + return 1; /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ @@ -6506,18 +6689,18 @@ restart: r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) - return EMULATE_DONE; + return 1; if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) - return EMULATE_DONE; + return 1; return handle_emulation_failure(vcpu, emulation_type); } if (ctxt->have_exception) { - r = EMULATE_DONE; + r = 1; if (inject_emulated_exception(vcpu)) return r; } else if (vcpu->arch.pio.count) { @@ -6528,27 +6711,30 @@ restart: writeback = false; vcpu->arch.complete_userspace_io = complete_emulated_pio; } - r = EMULATE_USER_EXIT; + r = 0; } else if (vcpu->mmio_needed) { + ++vcpu->stat.mmio_exits; + if (!vcpu->mmio_is_write) writeback = false; - r = EMULATE_USER_EXIT; + r = 0; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; else - r = EMULATE_DONE; + r = 1; if (writeback) { unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE && ctxt->tf) - kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || - exception_type(ctxt->exception.vector) == EXCPT_TRAP) + exception_type(ctxt->exception.vector) == EXCPT_TRAP) { + kvm_rip_write(vcpu, ctxt->eip); + if (r && ctxt->tf) + r = kvm_vcpu_do_singlestep(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); + } /* * For STI, interrupts are shadowed; so KVM_REQ_EVENT will @@ -6705,7 +6891,7 @@ static void kvm_hyperv_tsc_notifier(void) struct kvm_vcpu *vcpu; int cpu; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_make_mclock_inprogress_request(kvm); @@ -6731,7 +6917,7 @@ static void kvm_hyperv_tsc_notifier(void) spin_unlock(&ka->pvclock_gtod_sync_lock); } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); } #endif @@ -6782,17 +6968,17 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) smp_call_function_single(cpu, tsc_khz_changed, freq, 1); - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != cpu) continue; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != smp_processor_id()) + if (vcpu->cpu != raw_smp_processor_id()) send_ipi = 1; } } - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -6857,7 +7043,6 @@ static void kvm_timer_init(void) cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } - pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", kvmclock_cpu_online, kvmclock_cpu_down_prep); @@ -6907,35 +7092,6 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .handle_intel_pt_intr = kvm_handle_intel_pt_intr, }; -static void kvm_set_mmio_spte_mask(void) -{ - u64 mask; - int maxphyaddr = boot_cpu_data.x86_phys_bits; - - /* - * Set the reserved bits and the present bit of an paging-structure - * entry to generate page fault with PFER.RSV = 1. - */ - - /* - * Mask the uppermost physical address bit, which would be reserved as - * long as the supported physical address width is less than 52. - */ - mask = 1ull << 51; - - /* Set the present bit. */ - mask |= 1ull; - - /* - * If reserved bit is not supported, clear the present bit to disable - * mmio page fault. - */ - if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52) - mask &= ~1ull; - - kvm_mmu_set_mmio_spte_mask(mask, mask); -} - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { @@ -6944,12 +7100,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) struct kvm_vcpu *vcpu; int i; - spin_lock(&kvm_lock); + mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); atomic_set(&kvm_guest_has_master_clock, 0); - spin_unlock(&kvm_lock); + mutex_unlock(&kvm_lock); } static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); @@ -7032,8 +7188,6 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_set_mmio_spte_mask(); - kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, @@ -7047,6 +7201,8 @@ int kvm_arch_init(void *opaque) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_lapic_init(); + if (pi_inject_timer == -1) + pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER); #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); @@ -7172,6 +7328,23 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); } +static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) +{ + struct kvm_vcpu *target = NULL; + struct kvm_apic_map *map; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) + target = map->phys_map[dest_id]->vcpu; + + rcu_read_unlock(); + + if (target && READ_ONCE(target->ready)) + kvm_vcpu_yield_to(target); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -7208,6 +7381,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; case KVM_HC_KICK_CPU: kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + kvm_sched_yield(vcpu->kvm, a1); ret = 0; break; #ifdef CONFIG_X86_64 @@ -7218,6 +7392,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_SEND_IPI: ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; + case KVM_HC_SCHED_YIELD: + kvm_sched_yield(vcpu->kvm, a0); + ret = 0; + break; default: ret = -KVM_ENOSYS; break; @@ -7950,14 +8128,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - wait_lapic_expire(vcpu); guest_enter_irqoff(); - fpregs_assert_state_consistent(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_return(); + /* The preempt notifier should have taken care of the FPU already. */ + WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -8001,13 +8175,29 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_before_interrupt(vcpu); - kvm_x86_ops->handle_external_intr(vcpu); - kvm_after_interrupt(vcpu); + kvm_x86_ops->handle_exit_irqoff(vcpu); + /* + * Consume any pending interrupts, including the possible source of + * VM-Exit on SVM and any ticks that occur between VM-Exit and now. + * An instruction is required after local_irq_enable() to fully unblock + * interrupts on processors that implement an interrupt shadow, the + * stat.exits increment will do nicely. + */ + kvm_before_interrupt(vcpu); + local_irq_enable(); ++vcpu->stat.exits; + local_irq_disable(); + kvm_after_interrupt(vcpu); guest_exit_irqoff(); + if (lapic_in_kernel(vcpu)) { + s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; + if (delta != S64_MIN) { + trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta); + vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN; + } + } local_irq_enable(); preempt_enable(); @@ -8136,12 +8326,11 @@ static int vcpu_run(struct kvm_vcpu *vcpu) static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { int r; + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) - return 0; - return 1; + return r; } static int complete_emulated_pio(struct kvm_vcpu *vcpu) @@ -8219,7 +8408,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(¤t->thread.fpu); + copy_fpregs_to_fpstate(vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); @@ -8236,7 +8425,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) fpregs_lock(); copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(¤t->thread.fpu.state); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); fpregs_mark_activate(); fpregs_unlock(); @@ -8509,14 +8698,17 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - - if (ret) - return EMULATE_FAIL; + if (ret) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); - return EMULATE_DONE; + return 1; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -8593,7 +8785,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); - if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { + if (is_pae_paging(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } @@ -8874,6 +9066,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) msr.host_initiated = true; kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + + /* poll control enabled by default */ + vcpu->arch.msr_kvm_poll_control = 1; + mutex_unlock(&vcpu->mutex); if (!kvmclock_periodic_sync) @@ -9015,7 +9211,7 @@ int kvm_arch_hardware_enable(void) * before any KVM threads can be running. Unfortunately, we can't * bring the TSCs fully up to date with real time, as we aren't yet far * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, ktime_get_boot_ns() will be using boot + * elapsed; our helper function, ktime_get_boottime_ns() will be using boot * variables that haven't been updated yet. * * So we simply find the maximum observed TSC above, then record the @@ -9106,9 +9302,9 @@ void kvm_arch_hardware_unsetup(void) kvm_x86_ops->hardware_unsetup(); } -void kvm_arch_check_processor_compat(void *rtn) +int kvm_arch_check_processor_compat(void) { - kvm_x86_ops->check_processor_compatibility(rtn); + return kvm_x86_ops->check_processor_compatibility(); } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) @@ -9230,6 +9426,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -9243,7 +9440,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); + kvm->arch.kvmclock_offset = -ktime_get_boottime_ns(); pvclock_update_vm_gtod_copy(kvm); kvm->arch.guest_can_read_msr_platform_info = true; @@ -9255,10 +9452,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); - if (kvm_x86_ops->vm_init) - return kvm_x86_ops->vm_init(kvm); - - return 0; + return kvm_x86_ops->vm_init(kvm); } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) @@ -9380,6 +9574,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_hv_destroy_vm(kvm); @@ -9561,8 +9756,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * Scan sptes if dirty logging has been stopped, dropping those * which can be collapsed into a single large-page spte. Later * page faults will create the large-page sptes. + * + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() */ - if ((change != KVM_MR_DELETE) && + if (change == KVM_MR_FLAGS_ONLY && (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_zap_collapsible_sptes(kvm, new); @@ -9638,6 +9838,22 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + kvm_test_request(KVM_REQ_SMI, vcpu) || + kvm_test_request(KVM_REQ_EVENT, vcpu)) + return true; + + if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu)) + return true; + + return false; +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return vcpu->arch.preempted_in_kernel; @@ -9788,6 +10004,36 @@ static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) sizeof(u32)); } +static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) + return false; + + if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || + (vcpu->arch.apf.send_user_only && + kvm_x86_ops->get_cpl(vcpu) == 0)) + return false; + + return true; +} + +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) +{ + if (unlikely(!lapic_in_kernel(vcpu) || + kvm_event_needs_reinjection(vcpu) || + vcpu->arch.exception.pending)) + return false; + + if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) + return false; + + /* + * If interrupts are off we cannot even use an artificial + * halt state. + */ + return kvm_x86_ops->interrupt_allowed(vcpu); +} + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { @@ -9796,11 +10042,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, trace_kvm_async_pf_not_present(work->arch.token, work->gva); kvm_add_async_pf_gfn(vcpu, work->arch.gfn); - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops->get_cpl(vcpu) == 0)) - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + if (kvm_can_deliver_async_pf(vcpu) && + !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { fault.vector = PF_VECTOR; fault.error_code_valid = true; fault.error_code = 0; @@ -9808,6 +10051,16 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, fault.address = work->arch.token; fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); + } else { + /* + * It is not possible to deliver a paravirtualized asynchronous + * page fault, but putting the guest in an artificial halt state + * can be beneficial nevertheless: if an interrupt arrives, we + * can deliver it timely and perhaps the guest will schedule + * another process. When the instruction that triggered a page + * fault is retried, hopefully the page will be ready in the host. + */ + kvm_make_request(KVM_REQ_APF_HALT, vcpu); } } @@ -9896,7 +10149,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); bool kvm_arch_has_irq_bypass(void) { - return kvm_x86_ops->update_pi_irte != NULL; + return true; } int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, @@ -9936,9 +10189,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - if (!kvm_x86_ops->update_pi_irte) - return -EINVAL; - return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); } @@ -9948,6 +10198,13 @@ bool kvm_vector_hashing_enabled(void) } EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); +bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.msr_kvm_poll_control & 1) == 0; +} +EXPORT_SYMBOL_GPL(kvm_arch_no_poll); + + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); @@ -9958,11 +10215,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); |