summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c8
-rw-r--r--arch/x86/kvm/emulate.c32
-rw-r--r--arch/x86/kvm/kvm_emulate.h6
-rw-r--r--arch/x86/kvm/lapic.c7
-rw-r--r--arch/x86/kvm/mmu/mmu.c15
-rw-r--r--arch/x86/kvm/pmu.c7
-rw-r--r--arch/x86/kvm/pmu.h2
-rw-r--r--arch/x86/kvm/svm/avic.c93
-rw-r--r--arch/x86/kvm/svm/nested.c26
-rw-r--r--arch/x86/kvm/svm/svm.c104
-rw-r--r--arch/x86/kvm/svm/svm.h15
-rw-r--r--arch/x86/kvm/vmx/nested.c11
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c4
-rw-r--r--arch/x86/kvm/vmx/vmx.c29
-rw-r--r--arch/x86/kvm/vmx/vmx.h5
-rw-r--r--arch/x86/kvm/x86.c65
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--arch/x86/kvm/xen.c97
18 files changed, 327 insertions, 206 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 494d4d351859..8bf541b24b4b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -282,6 +282,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_cpuid_entry2 *best;
+ u64 guest_supported_xcr0;
best = kvm_find_cpuid_entry(vcpu, 1, 0);
if (best && apic) {
@@ -293,9 +294,11 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_apic_set_version(vcpu);
}
- vcpu->arch.guest_supported_xcr0 =
+ guest_supported_xcr0 =
cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
+ vcpu->arch.guest_fpu.fpstate->user_xfeatures = guest_supported_xcr0;
+
kvm_update_pv_runtime(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -876,7 +879,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
eax.split.bit_width = cap.bit_width_gp;
eax.split.mask_length = cap.events_mask_len;
- edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS);
+ edx.split.num_counters_fixed =
+ min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED);
edx.split.bit_width_fixed = cap.bit_width_fixed;
if (cap.version)
edx.split.anythread_deprecated = 1;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5719d8cfdbd9..50ca1db7247c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -429,8 +429,23 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
FOP_END
/* Special case for SETcc - 1 instruction per cc */
+
+/*
+ * Depending on .config the SETcc functions look like:
+ *
+ * SETcc %al [3 bytes]
+ * RET [1 byte]
+ * INT3 [1 byte; CONFIG_SLS]
+ *
+ * Which gives possible sizes 4 or 5. When rounded up to the
+ * next power-of-two alignment they become 4 or 8.
+ */
+#define SETCC_LENGTH (4 + IS_ENABLED(CONFIG_SLS))
+#define SETCC_ALIGN (4 << IS_ENABLED(CONFIG_SLS))
+static_assert(SETCC_LENGTH <= SETCC_ALIGN);
+
#define FOP_SETCC(op) \
- ".align 4 \n\t" \
+ ".align " __stringify(SETCC_ALIGN) " \n\t" \
".type " #op ", @function \n\t" \
#op ": \n\t" \
#op " %al \n\t" \
@@ -665,7 +680,7 @@ static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
static inline bool emul_is_noncanonical_address(u64 la,
struct x86_emulate_ctxt *ctxt)
{
- return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
+ return !__is_canonical_address(la, ctxt_virt_addr_bits(ctxt));
}
/*
@@ -715,7 +730,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
case X86EMUL_MODE_PROT64:
*linear = la;
va_bits = ctxt_virt_addr_bits(ctxt);
- if (get_canonical(la, va_bits) != la)
+ if (!__is_canonical_address(la, va_bits))
goto bad;
*max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
@@ -1047,7 +1062,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
{
u8 rc;
- void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
+ void (*fop)(void) = (void *)em_setcc + SETCC_ALIGN * (condition & 0xf);
flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
asm("push %[flags]; popf; " CALL_NOSPEC
@@ -5380,8 +5395,13 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
void init_decode_cache(struct x86_emulate_ctxt *ctxt)
{
- memset(&ctxt->rip_relative, 0,
- (void *)&ctxt->modrm - (void *)&ctxt->rip_relative);
+ /* Clear fields that are set conditionally but read without a guard. */
+ ctxt->rip_relative = false;
+ ctxt->rex_prefix = 0;
+ ctxt->lock_prefix = 0;
+ ctxt->rep_prefix = 0;
+ ctxt->regs_valid = 0;
+ ctxt->regs_dirty = 0;
ctxt->io_read.pos = 0;
ctxt->io_read.end = 0;
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 39eded2426ff..840ddb4a9302 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -336,11 +336,7 @@ struct x86_emulate_ctxt {
fastop_t fop;
};
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
- /*
- * The following six fields are cleared together,
- * the rest are initialized unconditionally in x86_decode_insn
- * or elsewhere
- */
+
bool rip_relative;
u8 rex_prefix;
u8 lock_prefix;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d7e6fde82d25..9322e6340a74 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2306,7 +2306,12 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
apic->irr_pending = true;
apic->isr_count = 1;
} else {
- apic->irr_pending = (apic_search_irr(apic) != -1);
+ /*
+ * Don't clear irr_pending, searching the IRR can race with
+ * updates from the CPU as APICv is still active from hardware's
+ * perspective. The flag will be cleared as appropriate when
+ * KVM injects the interrupt.
+ */
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
}
}
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 593093b52395..5628d0ba637e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3565,7 +3565,7 @@ set_root_pgd:
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
- return 0;
+ return r;
}
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
@@ -3889,12 +3889,23 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
walk_shadow_page_lockless_end(vcpu);
}
+static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
+{
+ /* make sure the token value is not 0 */
+ u32 id = vcpu->arch.apf.id;
+
+ if (id << 12 == 0)
+ vcpu->arch.apf.id = 1;
+
+ return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+}
+
static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
gfn_t gfn)
{
struct kvm_arch_async_pf arch;
- arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+ arch.token = alloc_apf_token(vcpu);
arch.gfn = gfn;
arch.direct_map = vcpu->arch.mmu->direct_map;
arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index f614f95acc6b..b1a02993782b 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -95,7 +95,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
}
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
- unsigned config, bool exclude_user,
+ u64 config, bool exclude_user,
bool exclude_kernel, bool intr,
bool in_tx, bool in_tx_cp)
{
@@ -181,7 +181,8 @@ static int cmp_u64(const void *a, const void *b)
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
- unsigned config, type = PERF_TYPE_RAW;
+ u64 config;
+ u32 type = PERF_TYPE_RAW;
struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter;
bool allow_event = true;
@@ -220,7 +221,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
}
if (type == PERF_TYPE_RAW)
- config = eventsel & X86_RAW_EVENT_MASK;
+ config = eventsel & AMD64_RAW_EVENT_MASK;
if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
return;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 7a7b8d5b775e..9e66fba1d6a3 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -15,8 +15,6 @@
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
-#define MAX_FIXED_COUNTERS 3
-
struct kvm_event_hw_type_mapping {
u8 eventsel;
u8 unit_mask;
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 90364d02f22a..fb3e20791338 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -27,20 +27,6 @@
#include "irq.h"
#include "svm.h"
-#define SVM_AVIC_DOORBELL 0xc001011b
-
-#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-
-/*
- * 0xff is broadcast, so the max index allowed for physical APIC ID
- * table is 0xfe. APIC IDs above 0xff are reserved.
- */
-#define AVIC_MAX_PHYSICAL_ID_COUNT 255
-
-#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
-#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
-#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
-
/* AVIC GATAG is encoded using VM and VCPU IDs */
#define AVIC_VCPU_ID_BITS 8
#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
@@ -73,12 +59,6 @@ struct amd_svm_iommu_ir {
void *data; /* Storing pointer to struct amd_ir_data */
};
-enum avic_ipi_failure_cause {
- AVIC_IPI_FAILURE_INVALID_INT_TYPE,
- AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
- AVIC_IPI_FAILURE_INVALID_TARGET,
- AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
-};
/* Note:
* This function is called from IOMMU driver to notify
@@ -289,6 +269,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+void avic_ring_doorbell(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, the vCPU could get migrated to a different pCPU at any point,
+ * which could result in signalling the wrong/previous pCPU. But if
+ * that happens the vCPU is guaranteed to do a VMRUN (after being
+ * migrated) and thus will process pending interrupts, i.e. a doorbell
+ * is not needed (and the spurious one is harmless).
+ */
+ int cpu = READ_ONCE(vcpu->cpu);
+
+ if (cpu != get_cpu())
+ wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
+ put_cpu();
+}
+
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh)
{
@@ -304,8 +300,13 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
kvm_for_each_vcpu(i, vcpu, kvm) {
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK))
- kvm_vcpu_wake_up(vcpu);
+ icrl & APIC_DEST_MASK)) {
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ }
}
}
@@ -345,8 +346,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
break;
case AVIC_IPI_FAILURE_INVALID_TARGET:
- WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
- index, vcpu->vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
@@ -669,52 +668,6 @@ void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
return;
}
-int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
-{
- if (!vcpu->arch.apicv_active)
- return -1;
-
- kvm_lapic_set_irr(vec, vcpu->arch.apic);
-
- /*
- * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
- * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
- * the read of guest_mode, which guarantees that either VMRUN will see
- * and process the new vIRR entry, or that the below code will signal
- * the doorbell if the vCPU is already running in the guest.
- */
- smp_mb__after_atomic();
-
- /*
- * Signal the doorbell to tell hardware to inject the IRQ if the vCPU
- * is in the guest. If the vCPU is not in the guest, hardware will
- * automatically process AVIC interrupts at VMRUN.
- */
- if (vcpu->mode == IN_GUEST_MODE) {
- int cpu = READ_ONCE(vcpu->cpu);
-
- /*
- * Note, the vCPU could get migrated to a different pCPU at any
- * point, which could result in signalling the wrong/previous
- * pCPU. But if that happens the vCPU is guaranteed to do a
- * VMRUN (after being migrated) and thus will process pending
- * interrupts, i.e. a doorbell is not needed (and the spurious
- * one is harmless).
- */
- if (cpu != get_cpu())
- wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
- put_cpu();
- } else {
- /*
- * Wake the vCPU if it was blocking. KVM will then detect the
- * pending IRQ when checking if the vCPU has a wake event.
- */
- kvm_vcpu_wake_up(vcpu);
- }
-
- return 0;
-}
-
bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
return false;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 1218b5a342fc..39d280e7e80e 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1457,18 +1457,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
!__nested_vmcb_check_save(vcpu, &save_cached))
goto out_free;
- /*
- * While the nested guest CR3 is already checked and set by
- * KVM_SET_SREGS, it was set when nested state was yet loaded,
- * thus MMU might not be initialized correctly.
- * Set it again to fix this.
- */
-
- ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
- nested_npt_enabled(svm), false);
- if (WARN_ON_ONCE(ret))
- goto out_free;
-
/*
* All checks done, we can enter guest mode. Userspace provides
@@ -1494,6 +1482,20 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
svm_switch_vmcb(svm, &svm->nested.vmcb02);
nested_vmcb02_prepare_control(svm);
+
+ /*
+ * While the nested guest CR3 is already checked and set by
+ * KVM_SET_SREGS, it was set when nested state was yet loaded,
+ * thus MMU might not be initialized correctly.
+ * Set it again to fix this.
+ */
+
+ ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm), false);
+ if (WARN_ON_ONCE(ret))
+ goto out_free;
+
+
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
ret = 0;
out_free:
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a290efb272ad..fd3a00c892c7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1585,6 +1585,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 hcr0 = cr0;
+ bool old_paging = is_paging(vcpu);
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
@@ -1601,8 +1602,11 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
#endif
vcpu->arch.cr0 = cr0;
- if (!npt_enabled)
+ if (!npt_enabled) {
hcr0 |= X86_CR0_PG | X86_CR0_WP;
+ if (old_paging != is_paging(vcpu))
+ svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
/*
* re-enable caching here because the QEMU bios
@@ -1646,8 +1650,12 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
svm_flush_tlb(vcpu);
vcpu->arch.cr4 = cr4;
- if (!npt_enabled)
+ if (!npt_enabled) {
cr4 |= X86_CR4_PAE;
+
+ if (!is_paging(vcpu))
+ cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
@@ -2685,8 +2693,23 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
u64 data = msr->data;
switch (ecx) {
case MSR_AMD64_TSC_RATIO:
- if (!msr->host_initiated && !svm->tsc_scaling_enabled)
- return 1;
+
+ if (!svm->tsc_scaling_enabled) {
+
+ if (!msr->host_initiated)
+ return 1;
+ /*
+ * In case TSC scaling is not enabled, always
+ * leave this MSR at the default value.
+ *
+ * Due to bug in qemu 6.2.0, it would try to set
+ * this msr to 0 if tsc scaling is not enabled.
+ * Ignore this value as well.
+ */
+ if (data != 0 && data != svm->tsc_ratio_msr)
+ return 1;
+ break;
+ }
if (data & TSC_RATIO_RSVD)
return 1;
@@ -3291,21 +3314,55 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
-static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
- int trig_mode, int vector)
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vector)
{
- struct kvm_vcpu *vcpu = apic->vcpu;
+ /*
+ * vcpu->arch.apicv_active must be read after vcpu->mode.
+ * Pairs with smp_store_release in vcpu_enter_guest.
+ */
+ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
- if (svm_deliver_avic_intr(vcpu, vector)) {
- kvm_lapic_set_irr(vector, apic);
+ if (!READ_ONCE(vcpu->arch.apicv_active)) {
+ /* Process the interrupt via inject_pending_event */
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
+ return;
+ }
+
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+ if (in_guest_mode) {
+ /*
+ * Signal the doorbell to tell hardware to inject the IRQ. If
+ * the vCPU exits the guest before the doorbell chimes, hardware
+ * will automatically process AVIC interrupts at the next VMRUN.
+ */
+ avic_ring_doorbell(vcpu);
} else {
- trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector);
+ /*
+ * Wake the vCPU if it was blocking. KVM will then detect the
+ * pending IRQ when checking if the vCPU has a wake event.
+ */
+ kvm_vcpu_wake_up(vcpu);
}
}
+static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ kvm_lapic_set_irr(vector, apic);
+
+ /*
+ * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
+ * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
+ * the read of guest_mode. This guarantees that either VMRUN will see
+ * and process the new vIRR entry, or that svm_complete_interrupt_delivery
+ * will signal the doorbell if the CPU has already entered the guest.
+ */
+ smp_mb__after_atomic();
+ svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
+}
+
static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3353,11 +3410,13 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_nmi_blocked(vcpu))
+ return 0;
+
/* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
return -EBUSY;
-
- return !svm_nmi_blocked(vcpu);
+ return 1;
}
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -3409,9 +3468,13 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
struct vcpu_svm *svm = to_svm(vcpu);
+
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_interrupt_blocked(vcpu))
+ return 0;
+
/*
* An IRQ must not be injected into L2 if it's supposed to VM-Exit,
* e.g. if the IRQ arrived asynchronously after checking nested events.
@@ -3419,7 +3482,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
return -EBUSY;
- return !svm_interrupt_blocked(vcpu);
+ return 1;
}
static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
@@ -4150,11 +4213,14 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (svm->nested.nested_run_pending)
return -EBUSY;
+ if (svm_smi_blocked(vcpu))
+ return 0;
+
/* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
return -EBUSY;
- return !svm_smi_blocked(vcpu);
+ return 1;
}
static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
@@ -4248,11 +4314,18 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
* Enter the nested guest now
*/
+ vmcb_mark_all_dirty(svm->vmcb01.ptr);
+
vmcb12 = map.hva;
nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
+ if (ret)
+ goto unmap_save;
+
+ svm->nested.nested_run_pending = 1;
+
unmap_save:
kvm_vcpu_unmap(vcpu, &map_save, true);
unmap_map:
@@ -4637,6 +4710,7 @@ static __init void svm_set_cpu_caps(void)
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
if (nested) {
kvm_cpu_cap_set(X86_FEATURE_SVM);
+ kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 73525353e424..fa98d6844728 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -489,6 +489,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value);
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code);
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
int read, int write);
+void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
+ int trig_mode, int vec);
/* nested.c */
@@ -556,17 +558,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
-#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
-#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
-#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
-#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
-#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
-
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@@ -583,12 +574,12 @@ bool svm_check_apicv_inhibit_reasons(ulong bit);
void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
-int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_ring_doorbell(struct kvm_vcpu *vcpu);
/* sev.c */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ba34e94049c7..dc822a1d403d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -246,8 +246,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
src = &prev->host_state;
dest = &vmx->loaded_vmcs->host_state;
- vmx_set_vmcs_host_state(dest, src->cr3, src->fs_sel, src->gs_sel,
- src->fs_base, src->gs_base);
+ vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
dest->ds_sel = src->ds_sel;
@@ -3056,7 +3055,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr4;
+ unsigned long cr3, cr4;
bool vm_fail;
if (!nested_early_check)
@@ -3079,6 +3078,12 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
*/
vmcs_writel(GUEST_RFLAGS, 0);
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 466d18fc0c5d..9b26596099a1 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -565,7 +565,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->gp_counters[i].current_config = 0;
}
- for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
@@ -591,7 +591,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmc->counter = pmc->eventsel = 0;
}
- for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmc = &pmu->fixed_counters[i];
pmc_stop_counter(pmc);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6c27bd0c89e1..b730d799c26e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1080,14 +1080,9 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
-void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
- u16 fs_sel, u16 gs_sel,
- unsigned long fs_base, unsigned long gs_base)
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base)
{
- if (unlikely(cr3 != host->cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- host->cr3 = cr3;
- }
if (unlikely(fs_sel != host->fs_sel)) {
if (!(fs_sel & 7))
vmcs_write16(HOST_FS_SELECTOR, fs_sel);
@@ -1182,9 +1177,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
gs_base = segment_base(gs_sel);
#endif
- vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(),
- fs_sel, gs_sel, fs_base, gs_base);
-
+ vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
vmx->guest_state_loaded = true;
}
@@ -6791,7 +6784,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr4;
+ unsigned long cr3, cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -6834,6 +6827,19 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
vcpu->arch.regs_dirty = 0;
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+ * it switches back to the current->mm, which can occur in KVM context
+ * when switching to a temporary mm to patch kernel code, e.g. if KVM
+ * toggles a static key while handling a VM-Exit.
+ */
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
@@ -7659,6 +7665,7 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
if (ret)
return ret;
+ vmx->nested.nested_run_pending = 1;
vmx->nested.smm.guest_mode = false;
}
return 0;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 7f2c82e7f38f..9c6bfcd84008 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -374,9 +374,8 @@ int allocate_vpid(void);
void free_vpid(int vpid);
void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
-void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
- u16 fs_sel, u16 gs_sel,
- unsigned long fs_base, unsigned long gs_base);
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base);
int vmx_get_cpl(struct kvm_vcpu *vcpu);
bool vmx_emulation_required(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7131d735b1ef..6db3a506b402 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -984,6 +984,18 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
+static inline u64 kvm_guest_supported_xcr0(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.guest_fpu.fpstate->user_xfeatures;
+}
+
+#ifdef CONFIG_X86_64
+static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
+{
+ return kvm_guest_supported_xcr0(vcpu) & XFEATURE_MASK_USER_DYNAMIC;
+}
+#endif
+
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
u64 xcr0 = xcr;
@@ -1003,7 +1015,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
* saving. However, xcr0 bit 0 is always set, even if the
* emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
*/
- valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
+ valid_bits = kvm_guest_supported_xcr0(vcpu) | XFEATURE_MASK_FP;
if (xcr0 & ~valid_bits)
return 1;
@@ -1737,7 +1749,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* value, and that something deterministic happens if the guest
* invokes 64-bit SYSENTER.
*/
- data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
+ data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
break;
case MSR_TSC_AUX:
if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
@@ -2351,10 +2363,12 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
return tsc;
}
+#ifdef CONFIG_X86_64
static inline int gtod_is_based_on_tsc(int mode)
{
return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
}
+#endif
static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
@@ -3706,8 +3720,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_XFD))
return 1;
- if (data & ~(XFEATURE_MASK_USER_DYNAMIC &
- vcpu->arch.guest_supported_xcr0))
+ if (data & ~kvm_guest_supported_xfd(vcpu))
return 1;
fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
@@ -3717,8 +3730,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_XFD))
return 1;
- if (data & ~(XFEATURE_MASK_USER_DYNAMIC &
- vcpu->arch.guest_supported_xcr0))
+ if (data & ~kvm_guest_supported_xfd(vcpu))
return 1;
vcpu->arch.guest_fpu.xfd_err = data;
@@ -4233,6 +4245,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
case KVM_CAP_SYS_ATTRIBUTES:
+ case KVM_CAP_ENABLE_CAP:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -6516,7 +6529,7 @@ static void kvm_init_msr_list(void)
u32 dummy[2];
unsigned i;
- BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
+ BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
"Please update the fixed PMCs in msrs_to_saved_all[]");
perf_get_x86_pmu_capability(&x86_pmu);
@@ -8840,7 +8853,7 @@ int kvm_arch_init(void *opaque)
}
if (pi_inject_timer == -1)
- pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
+ pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
#ifdef CONFIG_X86_64
pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
@@ -8942,6 +8955,13 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
return -KVM_EOPNOTSUPP;
+ /*
+ * When tsc is in permanent catchup mode guests won't be able to use
+ * pvclock_read_retry loop to get consistent view of pvclock
+ */
+ if (vcpu->arch.tsc_always_catchup)
+ return -KVM_EOPNOTSUPP;
+
if (!kvm_get_walltime_and_clockread(&ts, &cycle))
return -KVM_EOPNOTSUPP;
@@ -9160,6 +9180,7 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
likely(!pic_in_kernel(vcpu->kvm));
}
+/* Called within kvm->srcu read side. */
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
@@ -9168,16 +9189,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
- /*
- * The call to kvm_ready_for_interrupt_injection() may end up in
- * kvm_xen_has_interrupt() which may require the srcu lock to be
- * held, to protect against changes in the vcpu_info address.
- */
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_run->ready_for_interrupt_injection =
pic_in_kernel(vcpu->kvm) ||
kvm_vcpu_ready_for_interrupt_injection(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (is_smm(vcpu))
kvm_run->flags |= KVM_RUN_X86_SMM;
@@ -9795,6 +9809,7 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
/*
+ * Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
* userspace.
@@ -9983,7 +9998,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* result in virtual interrupt delivery.
*/
local_irq_disable();
- vcpu->mode = IN_GUEST_MODE;
+
+ /* Store vcpu->apicv_active before vcpu->mode. */
+ smp_store_release(&vcpu->mode, IN_GUEST_MODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -10171,6 +10188,7 @@ out:
return r;
}
+/* Called within kvm->srcu read side. */
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
bool hv_timer;
@@ -10230,12 +10248,12 @@ static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
!vcpu->arch.apf.halted);
}
+/* Called within kvm->srcu read side. */
static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
struct kvm *kvm = vcpu->kvm;
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
@@ -10263,14 +10281,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (__xfer_to_guest_mode_work_pending()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
r = xfer_to_guest_mode_handle_work(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (r)
return r;
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-
return r;
}
@@ -10376,6 +10392,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
+ struct kvm *kvm = vcpu->kvm;
int r;
vcpu_load(vcpu);
@@ -10383,6 +10400,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
r = -EINTR;
@@ -10393,7 +10411,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
* use before KVM has ever run the vCPU.
*/
WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
+
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
if (kvm_apic_accept_events(vcpu) < 0) {
r = 0;
goto out;
@@ -10453,8 +10475,9 @@ out:
if (kvm_run->kvm_valid_regs)
store_regs(vcpu);
post_kvm_run_save(vcpu);
- kvm_sigset_deactivate(vcpu);
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_sigset_deactivate(vcpu);
vcpu_put(vcpu);
return r;
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 767ec7f99516..f11d945ac41f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -166,14 +166,9 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
}
-static inline u64 get_canonical(u64 la, u8 vaddr_bits)
-{
- return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
-}
-
static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
{
- return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
+ return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu));
}
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index bad57535fad0..74be1fda58e3 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -133,32 +133,57 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
+ struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
+ struct kvm_memslots *slots = kvm_memslots(v->kvm);
+ bool atomic = (state == RUNSTATE_runnable);
uint64_t state_entry_time;
- unsigned int offset;
+ int __user *user_state;
+ uint64_t __user *user_times;
kvm_xen_update_runstate(v, state);
if (!vx->runstate_set)
return;
- BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+ if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
+ kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
+ return;
+
+ /* We made sure it fits in a single page */
+ BUG_ON(!ghc->memslot);
+
+ if (atomic)
+ pagefault_disable();
- offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
-#ifdef CONFIG_X86_64
/*
- * The only difference is alignment of uint64_t in 32-bit.
- * So the first field 'state' is accessed directly using
- * offsetof() (where its offset happens to be zero), while the
- * remaining fields which are all uint64_t, start at 'offset'
- * which we tweak here by adding 4.
+ * The only difference between 32-bit and 64-bit versions of the
+ * runstate struct us the alignment of uint64_t in 32-bit, which
+ * means that the 64-bit version has an additional 4 bytes of
+ * padding after the first field 'state'.
+ *
+ * So we use 'int __user *user_state' to point to the state field,
+ * and 'uint64_t __user *user_times' for runstate_entry_time. So
+ * the actual array of time[] in each state starts at user_times[1].
*/
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
+ user_state = (int __user *)ghc->hva;
+
+ BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+
+ user_times = (uint64_t __user *)(ghc->hva +
+ offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time));
+#ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4);
if (v->kvm->arch.xen.long_mode)
- offset = offsetof(struct vcpu_runstate_info, state_entry_time);
+ user_times = (uint64_t __user *)(ghc->hva +
+ offsetof(struct vcpu_runstate_info,
+ state_entry_time));
#endif
/*
* First write the updated state_entry_time at the appropriate
@@ -172,10 +197,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
sizeof(state_entry_time));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &state_entry_time, offset,
- sizeof(state_entry_time)))
- return;
+ if (__put_user(state_entry_time, user_times))
+ goto out;
smp_wmb();
/*
@@ -189,11 +212,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &vx->current_runstate,
- offsetof(struct vcpu_runstate_info, state),
- sizeof(vx->current_runstate)))
- return;
+ if (__put_user(vx->current_runstate, user_state))
+ goto out;
/*
* Write the actual runstate times immediately after the
@@ -208,24 +228,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &vx->runstate_times[0],
- offset + sizeof(u64),
- sizeof(vx->runstate_times)))
- return;
-
+ if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
+ goto out;
smp_wmb();
/*
* Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
* runstate_entry_time field.
*/
-
state_entry_time &= ~XEN_RUNSTATE_UPDATE;
- if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
- &state_entry_time, offset,
- sizeof(state_entry_time)))
- return;
+ __put_user(state_entry_time, user_times);
+ smp_wmb();
+
+ out:
+ mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+
+ if (atomic)
+ pagefault_enable();
}
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
@@ -443,6 +462,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
+ /* It must fit within a single page */
+ if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
+ r = -EINVAL;
+ break;
+ }
+
r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_info_cache,
data->u.gpa,
@@ -460,6 +485,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
+ /* It must fit within a single page */
+ if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
+ r = -EINVAL;
+ break;
+ }
+
r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_time_info_cache,
data->u.gpa,
@@ -481,6 +512,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
+ /* It must fit within a single page */
+ if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
+ r = -EINVAL;
+ break;
+ }
+
r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.runstate_cache,
data->u.gpa,