summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c248
1 files changed, 192 insertions, 56 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 41cce5031126..2c924075f6f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -145,21 +145,21 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
static bool __read_mostly ignore_msrs = 0;
-module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_msrs, bool, 0644);
bool __read_mostly report_ignored_msrs = true;
-module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(report_ignored_msrs, bool, 0644);
EXPORT_SYMBOL_GPL(report_ignored_msrs);
unsigned int min_timer_period_us = 200;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+module_param(min_timer_period_us, uint, 0644);
static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+module_param(kvmclock_periodic_sync, bool, 0444);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
-module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+module_param(tsc_tolerance_ppm, uint, 0644);
/*
* lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
@@ -168,13 +168,13 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
* tuning, i.e. allows privileged userspace to set an exact advancement time.
*/
static int __read_mostly lapic_timer_advance_ns = -1;
-module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
+module_param(lapic_timer_advance_ns, int, 0644);
static bool __read_mostly vector_hashing = true;
-module_param(vector_hashing, bool, S_IRUGO);
+module_param(vector_hashing, bool, 0444);
bool __read_mostly enable_vmware_backdoor = false;
-module_param(enable_vmware_backdoor, bool, S_IRUGO);
+module_param(enable_vmware_backdoor, bool, 0444);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
/*
@@ -186,7 +186,7 @@ static int __read_mostly force_emulation_prefix;
module_param(force_emulation_prefix, int, 0644);
int __read_mostly pi_inject_timer = -1;
-module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+module_param(pi_inject_timer, bint, 0644);
/* Enable/disable PMU virtualization */
bool __read_mostly enable_pmu = true;
@@ -962,7 +962,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
kvm_mmu_reset_context(vcpu);
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
- kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+ kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
@@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_o
if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
return;
- /*
- * The guest calculates current wall clock time by adding
- * system time (updated by kvm_guest_time_update below) to the
- * wall clock specified here. We do the reverse here.
- */
- wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
- wc.nsec = do_div(wall_nsec, 1000000000);
+ wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
wc.version = version;
@@ -2714,8 +2709,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
kvm_track_tsc_matching(vcpu);
}
-static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
{
+ u64 data = user_value ? *user_value : 0;
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
@@ -2730,25 +2726,37 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
if (vcpu->arch.virtual_tsc_khz) {
if (data == 0) {
/*
- * detection of vcpu initialization -- need to sync
- * with other vCPUs. This particularly helps to keep
- * kvm_clock stable after CPU hotplug
+ * Force synchronization when creating a vCPU, or when
+ * userspace explicitly writes a zero value.
*/
synchronizing = true;
- } else {
+ } else if (kvm->arch.user_set_tsc) {
u64 tsc_exp = kvm->arch.last_tsc_write +
nsec_to_cycles(vcpu, elapsed);
u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
/*
- * Special case: TSC write with a small delta (1 second)
- * of virtual cycle time against real time is
- * interpreted as an attempt to synchronize the CPU.
+ * Here lies UAPI baggage: when a user-initiated TSC write has
+ * a small delta (1 second) of virtual cycle time against the
+ * previously set vCPU, we assume that they were intended to be
+ * in sync and the delta was only due to the racy nature of the
+ * legacy API.
+ *
+ * This trick falls down when restoring a guest which genuinely
+ * has been running for less time than the 1 second of imprecision
+ * which we allow for in the legacy API. In this case, the first
+ * value written by userspace (on any vCPU) should not be subject
+ * to this 'correction' to make it sync up with values that only
+ * come from the kernel's default vCPU creation. Make the 1-second
+ * slop hack only trigger if the user_set_tsc flag is already set.
*/
synchronizing = data < tsc_exp + tsc_hz &&
data + tsc_hz > tsc_exp;
}
}
+ if (user_value)
+ kvm->arch.user_set_tsc = true;
+
/*
* For a reliable TSC, we can match TSC offsets, and for an unstable
* TSC, we add elapsed time in this computation. We could let the
@@ -3232,16 +3240,94 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
if (vcpu->pv_time.active)
kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+#ifdef CONFIG_KVM_XEN
if (vcpu->xen.vcpu_info_cache.active)
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
offsetof(struct compat_vcpu_info, time));
if (vcpu->xen.vcpu_time_info_cache.active)
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
+#endif
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
/*
+ * The pvclock_wall_clock ABI tells the guest the wall clock time at
+ * which it started (i.e. its epoch, when its kvmclock was zero).
+ *
+ * In fact those clocks are subtly different; wall clock frequency is
+ * adjusted by NTP and has leap seconds, while the kvmclock is a
+ * simple function of the TSC without any such adjustment.
+ *
+ * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
+ * that and kvmclock, but even that would be subject to change over
+ * time.
+ *
+ * Attempt to calculate the epoch at a given moment using the *same*
+ * TSC reading via kvm_get_walltime_and_clockread() to obtain both
+ * wallclock and kvmclock times, and subtracting one from the other.
+ *
+ * Fall back to using their values at slightly different moments by
+ * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
+ */
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ struct pvclock_vcpu_time_info hv_clock;
+ struct kvm_arch *ka = &kvm->arch;
+ unsigned long seq, local_tsc_khz;
+ struct timespec64 ts;
+ uint64_t host_tsc;
+
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+
+ local_tsc_khz = 0;
+ if (!ka->use_master_clock)
+ break;
+
+ /*
+ * The TSC read and the call to get_cpu_tsc_khz() must happen
+ * on the same CPU.
+ */
+ get_cpu();
+
+ local_tsc_khz = get_cpu_tsc_khz();
+
+ if (local_tsc_khz &&
+ !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+ local_tsc_khz = 0; /* Fall back to old method */
+
+ put_cpu();
+
+ /*
+ * These values must be snapshotted within the seqcount loop.
+ * After that, it's just mathematics which can happen on any
+ * CPU at any time.
+ */
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+ /*
+ * If the conditions were right, and obtaining the wallclock+TSC was
+ * successful, calculate the KVM clock at the corresponding time and
+ * subtract one from the other to get the guest's epoch in nanoseconds
+ * since 1970-01-01.
+ */
+ if (local_tsc_khz) {
+ kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
+ &hv_clock.tsc_shift,
+ &hv_clock.tsc_to_system_mul);
+ return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+ __pvclock_read_cycles(&hv_clock, host_tsc);
+ }
+#endif
+ return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+}
+
+/*
* kvmclock updates which are isolated to a given vcpu, such as
* vcpu->cpu migration, should not allow system_timestamp from
* the rest of the vcpus to remain static. Otherwise ntp frequency
@@ -3290,9 +3376,6 @@ static void kvmclock_sync_fn(struct work_struct *work)
kvmclock_sync_work);
struct kvm *kvm = container_of(ka, struct kvm, arch);
- if (!kvmclock_periodic_sync)
- return;
-
schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
@@ -3641,6 +3724,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_AMD64_PATCH_LOADER:
case MSR_AMD64_BU_CFG2:
case MSR_AMD64_DC_CFG:
+ case MSR_AMD64_TW_CFG:
case MSR_F15H_EX_CFG:
break;
@@ -3670,17 +3754,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.perf_capabilities = data;
kvm_pmu_refresh(vcpu);
break;
- case MSR_IA32_PRED_CMD:
- if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
- return 1;
+ case MSR_IA32_PRED_CMD: {
+ u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
+
+ if (!msr_info->host_initiated) {
+ if ((!guest_has_pred_cmd_msr(vcpu)))
+ return 1;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+ reserved_bits |= PRED_CMD_IBPB;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
+ reserved_bits |= PRED_CMD_SBPB;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ reserved_bits |= PRED_CMD_IBPB;
- if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+ if (!boot_cpu_has(X86_FEATURE_SBPB))
+ reserved_bits |= PRED_CMD_SBPB;
+
+ if (data & reserved_bits)
return 1;
+
if (!data)
break;
- wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ wrmsrl(MSR_IA32_PRED_CMD, data);
break;
+ }
case MSR_IA32_FLUSH_CMD:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
@@ -3700,13 +3803,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~(u64)0x100; /* ignore ignne emulation enable */
data &= ~(u64)0x8; /* ignore TLB cache disable */
- /* Handle McStatusWrEn */
- if (data == BIT_ULL(18)) {
- vcpu->arch.msr_hwcr = data;
- } else if (data != 0) {
+ /*
+ * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+ * through at least v6.6 whine if TscFreqSel is clear,
+ * depending on F/M/S.
+ */
+ if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
+ vcpu->arch.msr_hwcr = data;
break;
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
@@ -3777,7 +3883,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_TSC:
if (msr_info->host_initiated) {
- kvm_synchronize_tsc(vcpu, data);
+ kvm_synchronize_tsc(vcpu, &data);
} else {
u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
adjust_tsc_offset_guest(vcpu, adj);
@@ -4065,6 +4171,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_AMD64_BU_CFG2:
case MSR_IA32_PERF_CTL:
case MSR_AMD64_DC_CFG:
+ case MSR_AMD64_TW_CFG:
case MSR_F15H_EX_CFG:
/*
* Intel Sandy Bridge CPUs must support the RAPL (running average power
@@ -5547,6 +5654,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
ns = get_kvmclock_base_ns();
+ kvm->arch.user_set_tsc = true;
__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
@@ -6259,6 +6367,9 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
struct kvm_vcpu *vcpu;
unsigned long i;
+ if (!kvm_x86_ops.cpu_dirty_log_size)
+ return;
+
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vcpu_kick(vcpu);
}
@@ -7485,11 +7596,11 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
-static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
- return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
- insn, insn_len);
+ return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
}
int handle_ud(struct kvm_vcpu *vcpu)
@@ -7499,8 +7610,10 @@ int handle_ud(struct kvm_vcpu *vcpu)
int emul_type = EMULTYPE_TRAP_UD;
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
+ int r;
- if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
+ r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0);
+ if (r != X86EMUL_CONTINUE)
return 1;
if (fep_flags &&
@@ -8882,8 +8995,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
- if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
- return 1;
+ r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
+ if (r != X86EMUL_CONTINUE) {
+ if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
+ return 1;
+
+ WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE);
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
vcpu->arch.l1tf_flush_l1d = true;
@@ -10587,16 +10706,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_PMU, vcpu))
+ kvm_pmu_handle_event(vcpu);
+ if (kvm_check_request(KVM_REQ_PMI, vcpu))
+ kvm_pmu_deliver_pmi(vcpu);
#ifdef CONFIG_KVM_SMM
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
#endif
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
- if (kvm_check_request(KVM_REQ_PMU, vcpu))
- kvm_pmu_handle_event(vcpu);
- if (kvm_check_request(KVM_REQ_PMI, vcpu))
- kvm_pmu_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
if (test_bit(vcpu->arch.pending_ioapic_eoi,
@@ -11532,7 +11651,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
*mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
- vcpu->arch.cr0 = sregs->cr0;
*mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
@@ -11576,8 +11694,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
if (ret)
return ret;
- if (mmu_reset_needed)
+ if (mmu_reset_needed) {
kvm_mmu_reset_context(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ }
max_bits = KVM_NR_INTERRUPTS;
pending_vec = find_first_bit(
@@ -11618,8 +11738,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
mmu_reset_needed = 1;
vcpu->arch.pdptrs_from_userspace = true;
}
- if (mmu_reset_needed)
+ if (mmu_reset_needed) {
kvm_mmu_reset_context(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ }
return 0;
}
@@ -11970,7 +12092,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
- kvm_synchronize_tsc(vcpu, 0);
+ kvm_synchronize_tsc(vcpu, NULL);
vcpu_put(vcpu);
/* poll control enabled by default */
@@ -12326,7 +12448,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
goto out_uninit_mmu;
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
- INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -13202,15 +13323,30 @@ bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
+{
+ /*
+ * Non-coherent DMA assignment and de-assignment will affect
+ * whether KVM honors guest MTRRs and cause changes in memtypes
+ * in TDP.
+ * So, pass %true unconditionally to indicate non-coherent DMA was,
+ * or will be involved, and that zapping SPTEs might be necessary.
+ */
+ if (__kvm_mmu_honors_guest_mtrrs(true))
+ kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
+}
+
void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
{
- atomic_inc(&kvm->arch.noncoherent_dma_count);
+ if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
+ kvm_noncoherent_dma_assignment_start_or_stop(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
{
- atomic_dec(&kvm->arch.noncoherent_dma_count);
+ if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
+ kvm_noncoherent_dma_assignment_start_or_stop(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);