summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/cpuid.c28
-rw-r--r--arch/x86/kvm/hyperv.c20
-rw-r--r--arch/x86/kvm/i8259.c20
-rw-r--r--arch/x86/kvm/ioapic.c2
-rw-r--r--arch/x86/kvm/ioapic.h4
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/mmu/mmu.c30
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c35
-rw-r--r--arch/x86/kvm/svm/avic.c2
-rw-r--r--arch/x86/kvm/svm/nested.c23
-rw-r--r--arch/x86/kvm/svm/sev.c45
-rw-r--r--arch/x86/kvm/svm/svm.c35
-rw-r--r--arch/x86/kvm/svm/svm.h6
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h2
-rw-r--r--arch/x86/kvm/trace.h15
-rw-r--r--arch/x86/kvm/vmx/nested.c56
-rw-r--r--arch/x86/kvm/vmx/vmx.h2
-rw-r--r--arch/x86/kvm/x86.c17
18 files changed, 220 insertions, 124 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 739be5da3bca..fe03bd978761 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -208,30 +208,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_mmu_after_set_cpuid(vcpu);
}
-static int is_efer_nx(void)
-{
- return host_efer & EFER_NX;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_cpuid_entry2 *e, *entry;
-
- entry = NULL;
- for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
- e = &vcpu->arch.cpuid_entries[i];
- if (e->function == 0x80000001) {
- entry = e;
- break;
- }
- }
- if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) {
- cpuid_entry_clear(entry, X86_FEATURE_NX);
- printk(KERN_INFO "kvm: guest NX capability removed\n");
- }
-}
-
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -302,7 +278,6 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
vcpu->arch.cpuid_entries = e2;
vcpu->arch.cpuid_nent = cpuid->nent;
- cpuid_fix_nx_cap(vcpu);
kvm_update_cpuid_runtime(vcpu);
kvm_vcpu_after_set_cpuid(vcpu);
@@ -401,7 +376,6 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
void kvm_set_cpu_caps(void)
{
- unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
#ifdef CONFIG_X86_64
unsigned int f_gbpages = F(GBPAGES);
unsigned int f_lm = F(LM);
@@ -515,7 +489,7 @@ void kvm_set_cpu_caps(void)
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
F(PAT) | F(PSE36) | 0 /* Reserved */ |
- f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index b07592ca92f0..41d2a53c5dea 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1933,7 +1933,7 @@ ret_success:
void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *entry;
- struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu;
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) {
@@ -2016,6 +2016,7 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
{
+ trace_kvm_hv_hypercall_done(result);
kvm_hv_hypercall_set_result(vcpu, result);
++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
@@ -2139,6 +2140,7 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_hv_hcall hc;
u64 ret = HV_STATUS_SUCCESS;
@@ -2173,17 +2175,25 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
hc.rep = !!(hc.rep_cnt || hc.rep_idx);
- if (hc.fast && is_xmm_fast_hypercall(&hc))
- kvm_hv_hypercall_read_xmm(&hc);
-
trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx,
hc.ingpa, hc.outgpa);
- if (unlikely(!hv_check_hypercall_access(to_hv_vcpu(vcpu), hc.code))) {
+ if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) {
ret = HV_STATUS_ACCESS_DENIED;
goto hypercall_complete;
}
+ if (hc.fast && is_xmm_fast_hypercall(&hc)) {
+ if (unlikely(hv_vcpu->enforce_cpuid &&
+ !(hv_vcpu->cpuid_cache.features_edx &
+ HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ kvm_hv_hypercall_read_xmm(&hc);
+ }
+
switch (hc.code) {
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
if (unlikely(hc.rep)) {
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 629a09ca9860..0b80263d46d8 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -541,17 +541,17 @@ static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
addr, len, val);
}
-static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int picdev_elcr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
gpa_t addr, int len, const void *val)
{
- return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
+ return picdev_write(container_of(dev, struct kvm_pic, dev_elcr),
addr, len, val);
}
-static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+static int picdev_elcr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
gpa_t addr, int len, void *val)
{
- return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
+ return picdev_read(container_of(dev, struct kvm_pic, dev_elcr),
addr, len, val);
}
@@ -577,9 +577,9 @@ static const struct kvm_io_device_ops picdev_slave_ops = {
.write = picdev_slave_write,
};
-static const struct kvm_io_device_ops picdev_eclr_ops = {
- .read = picdev_eclr_read,
- .write = picdev_eclr_write,
+static const struct kvm_io_device_ops picdev_elcr_ops = {
+ .read = picdev_elcr_read,
+ .write = picdev_elcr_write,
};
int kvm_pic_init(struct kvm *kvm)
@@ -602,7 +602,7 @@ int kvm_pic_init(struct kvm *kvm)
*/
kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
- kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
+ kvm_iodevice_init(&s->dev_elcr, &picdev_elcr_ops);
mutex_lock(&kvm->slots_lock);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
&s->dev_master);
@@ -613,7 +613,7 @@ int kvm_pic_init(struct kvm *kvm)
if (ret < 0)
goto fail_unreg_2;
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_elcr);
if (ret < 0)
goto fail_unreg_1;
@@ -647,7 +647,7 @@ void kvm_pic_destroy(struct kvm *kvm)
mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
- kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_elcr);
mutex_unlock(&kvm->slots_lock);
kvm->arch.vpic = NULL;
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 698969e18fe3..ff005fe738a4 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
{
ioapic->rtc_status.pending_eoi = 0;
- bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID);
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1);
}
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 660401700075..11e4065e1617 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -43,13 +43,13 @@ struct kvm_vcpu;
struct dest_map {
/* vcpu bitmap where IRQ has been sent */
- DECLARE_BITMAP(map, KVM_MAX_VCPU_ID);
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1);
/*
* Vector sent to a given vcpu, only valid when
* the vcpu's bit in map is set
*/
- u8 vectors[KVM_MAX_VCPU_ID];
+ u8 vectors[KVM_MAX_VCPU_ID + 1];
};
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 9b64abf9b3f1..650642b18d15 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -55,7 +55,7 @@ struct kvm_pic {
int output; /* intr from master PIC */
struct kvm_io_device dev_master;
struct kvm_io_device dev_slave;
- struct kvm_io_device dev_eclr;
+ struct kvm_io_device dev_elcr;
void (*ack_notifier)(void *opaque, int irq);
unsigned long irq_states[PIC_NUM_PINS];
};
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 66f7f5bc3482..47b765270239 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1644,7 +1644,7 @@ static int is_empty_shadow_page(u64 *spt)
* aggregate version in order to make the slab shrinker
* faster
*/
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
{
kvm->arch.n_used_mmu_pages += nr;
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
@@ -2535,6 +2535,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
{
struct kvm_mmu_page *sp;
+ bool locked = false;
/*
* Force write-protection if the page is being tracked. Note, the page
@@ -2557,9 +2558,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
if (sp->unsync)
continue;
+ /*
+ * TDP MMU page faults require an additional spinlock as they
+ * run with mmu_lock held for read, not write, and the unsync
+ * logic is not thread safe. Take the spinklock regardless of
+ * the MMU type to avoid extra conditionals/parameters, there's
+ * no meaningful penalty if mmu_lock is held for write.
+ */
+ if (!locked) {
+ locked = true;
+ spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
+
+ /*
+ * Recheck after taking the spinlock, a different vCPU
+ * may have since marked the page unsync. A false
+ * positive on the unprotected check above is not
+ * possible as clearing sp->unsync _must_ hold mmu_lock
+ * for write, i.e. unsync cannot transition from 0->1
+ * while this CPU holds mmu_lock for read (or write).
+ */
+ if (READ_ONCE(sp->unsync))
+ continue;
+ }
+
WARN_ON(sp->role.level != PG_LEVEL_4K);
kvm_unsync_page(vcpu, sp);
}
+ if (locked)
+ spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
/*
* We need to ensure that the marking of unsync pages is visible
@@ -5537,6 +5563,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
+
if (!kvm_mmu_init_tdp_mmu(kvm))
/*
* No smp_load/store wrappers needed here as we are in
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 0853370bd811..d80cb122b5f3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
if (!kvm->arch.tdp_mmu_enabled)
return;
+ WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
@@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
-
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
@@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
list_del_rcu(&root->link);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
- zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
+ zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@@ -724,13 +723,29 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield, bool flush,
bool shared)
{
+ gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ bool zap_all = (start == 0 && end >= max_gfn_host);
struct tdp_iter iter;
+ /*
+ * No need to try to step down in the iterator when zapping all SPTEs,
+ * zapping the top-level non-leaf SPTEs will recurse on their children.
+ */
+ int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
+
+ /*
+ * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
+ * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
+ * and so KVM will never install a SPTE for such addresses.
+ */
+ end = min(end, max_gfn_host);
+
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
rcu_read_lock();
- tdp_root_for_each_pte(iter, root, start, end) {
+ for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
+ min_level, start, end) {
retry:
if (can_yield &&
tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
@@ -744,9 +759,10 @@ retry:
/*
* If this is a non-last-level SPTE that covers a larger range
* than should be zapped, continue, and zap the mappings at a
- * lower level.
+ * lower level, except when zapping all SPTEs.
*/
- if ((iter.gfn < start ||
+ if (!zap_all &&
+ (iter.gfn < start ||
iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
!is_last_spte(iter.old_spte, iter.level))
continue;
@@ -794,12 +810,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
bool flush = false;
int i;
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
flush, false);
if (flush)
@@ -838,7 +853,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
*/
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
- gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
struct kvm_mmu_page *next_root;
struct kvm_mmu_page *root;
bool flush = false;
@@ -854,8 +868,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
rcu_read_unlock();
- flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
- true);
+ flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
/*
* Put the reference acquired in
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1d01da64c333..a8ad78a2faa1 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -646,7 +646,7 @@ out:
void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
+ struct vmcb *vmcb = svm->vmcb01.ptr;
bool activated = kvm_vcpu_apicv_active(vcpu);
if (!enable_apicv)
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 3bd09c50c98b..e5515477c30a 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -158,6 +158,9 @@ void recalc_intercepts(struct vcpu_svm *svm)
/* If SMI is not intercepted, ignore guest SMI intercept as well */
if (!intercept_smi)
vmcb_clr_intercept(c, INTERCEPT_SMI);
+
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
}
static void copy_vmcb_control_area(struct vmcb_control_area *dst,
@@ -503,7 +506,11 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{
- const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
+ const u32 int_ctl_vmcb01_bits =
+ V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK;
+
+ const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
+
struct kvm_vcpu *vcpu = &svm->vcpu;
/*
@@ -515,7 +522,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
* Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
* avic_physical_id.
*/
- WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK);
+ WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
/* Copied from vmcb01. msrpm_base can be overwritten later. */
svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
@@ -535,8 +542,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
svm->vmcb->control.int_ctl =
- (svm->nested.ctl.int_ctl & ~mask) |
- (svm->vmcb01.ptr->control.int_ctl & mask);
+ (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
+ (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits);
svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
@@ -702,8 +709,8 @@ out:
}
/* Copy state save area fields which are handled by VMRUN */
-void svm_copy_vmrun_state(struct vmcb_save_area *from_save,
- struct vmcb_save_area *to_save)
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save)
{
to_save->es = from_save->es;
to_save->cs = from_save->cs;
@@ -722,7 +729,7 @@ void svm_copy_vmrun_state(struct vmcb_save_area *from_save,
to_save->cpl = 0;
}
-void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
{
to_vmcb->save.fs = from_vmcb->save.fs;
to_vmcb->save.gs = from_vmcb->save.gs;
@@ -1385,7 +1392,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
- svm_copy_vmrun_state(save, &svm->vmcb01.ptr->save);
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
nested_load_control_from_vmcb12(svm, ctl);
svm_switch_vmcb(svm, &svm->nested.vmcb02);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 6710d9ee2e4b..7fbce342eec4 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -64,6 +64,7 @@ static DEFINE_MUTEX(sev_bitmap_lock);
unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned long sev_me_mask;
+static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
@@ -78,11 +79,11 @@ struct enc_region {
/* Called with the sev_bitmap_lock held, or on shutdown */
static int sev_flush_asids(int min_asid, int max_asid)
{
- int ret, pos, error = 0;
+ int ret, asid, error = 0;
/* Check if there are any ASIDs to reclaim before performing a flush */
- pos = find_next_bit(sev_reclaim_asid_bitmap, max_asid, min_asid);
- if (pos >= max_asid)
+ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
+ if (asid > max_asid)
return -EBUSY;
/*
@@ -115,15 +116,15 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
- max_sev_asid);
- bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
+ nr_asids);
+ bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);
return true;
}
static int sev_asid_new(struct kvm_sev_info *sev)
{
- int pos, min_asid, max_asid, ret;
+ int asid, min_asid, max_asid, ret;
bool retry = true;
enum misc_res_type type;
@@ -143,11 +144,11 @@ static int sev_asid_new(struct kvm_sev_info *sev)
* SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
* SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
*/
- min_asid = sev->es_active ? 0 : min_sev_asid - 1;
+ min_asid = sev->es_active ? 1 : min_sev_asid;
max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
again:
- pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_asid);
- if (pos >= max_asid) {
+ asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
+ if (asid > max_asid) {
if (retry && __sev_recycle_asids(min_asid, max_asid)) {
retry = false;
goto again;
@@ -157,11 +158,11 @@ again:
goto e_uncharge;
}
- __set_bit(pos, sev_asid_bitmap);
+ __set_bit(asid, sev_asid_bitmap);
mutex_unlock(&sev_bitmap_lock);
- return pos + 1;
+ return asid;
e_uncharge:
misc_cg_uncharge(type, sev->misc_cg, 1);
put_misc_cg(sev->misc_cg);
@@ -179,17 +180,16 @@ static int sev_get_asid(struct kvm *kvm)
static void sev_asid_free(struct kvm_sev_info *sev)
{
struct svm_cpu_data *sd;
- int cpu, pos;
+ int cpu;
enum misc_res_type type;
mutex_lock(&sev_bitmap_lock);
- pos = sev->asid - 1;
- __set_bit(pos, sev_reclaim_asid_bitmap);
+ __set_bit(sev->asid, sev_reclaim_asid_bitmap);
for_each_possible_cpu(cpu) {
sd = per_cpu(svm_data, cpu);
- sd->sev_vmcbs[pos] = NULL;
+ sd->sev_vmcbs[sev->asid] = NULL;
}
mutex_unlock(&sev_bitmap_lock);
@@ -1857,12 +1857,17 @@ void __init sev_hardware_setup(void)
min_sev_asid = edx;
sev_me_mask = 1UL << (ebx & 0x3f);
- /* Initialize SEV ASID bitmaps */
- sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+ /*
+ * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap,
+ * even though it's never used, so that the bitmap is indexed by the
+ * actual ASID.
+ */
+ nr_asids = max_sev_asid + 1;
+ sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
if (!sev_asid_bitmap)
goto out;
- sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+ sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL);
if (!sev_reclaim_asid_bitmap) {
bitmap_free(sev_asid_bitmap);
sev_asid_bitmap = NULL;
@@ -1907,7 +1912,7 @@ void sev_hardware_teardown(void)
return;
/* No need to take sev_bitmap_lock, all VMs have been destroyed. */
- sev_flush_asids(0, max_sev_asid);
+ sev_flush_asids(1, max_sev_asid);
bitmap_free(sev_asid_bitmap);
bitmap_free(sev_reclaim_asid_bitmap);
@@ -1921,7 +1926,7 @@ int sev_cpu_init(struct svm_cpu_data *sd)
if (!sev_enabled)
return 0;
- sd->sev_vmcbs = kcalloc(max_sev_asid + 1, sizeof(void *), GFP_KERNEL);
+ sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
if (!sd->sev_vmcbs)
return -ENOMEM;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 664d20f0689c..69639f9624f5 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1406,8 +1406,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
goto error_free_vmsa_page;
}
- svm_vcpu_init_msrpm(vcpu, svm->msrpm);
-
svm->vmcb01.ptr = page_address(vmcb01_page);
svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
@@ -1419,6 +1417,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm_switch_vmcb(svm, &svm->vmcb01);
init_vmcb(vcpu);
+ svm_vcpu_init_msrpm(vcpu, svm->msrpm);
+
svm_init_osvw(vcpu);
vcpu->arch.microcode_version = 0x01000065;
@@ -1568,8 +1568,11 @@ static void svm_set_vintr(struct vcpu_svm *svm)
{
struct vmcb_control_area *control;
- /* The following fields are ignored when AVIC is enabled */
- WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
+ /*
+ * The following fields are ignored when AVIC is enabled
+ */
+ WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
+
svm_set_intercept(svm, INTERCEPT_VINTR);
/*
@@ -1586,17 +1589,18 @@ static void svm_set_vintr(struct vcpu_svm *svm)
static void svm_clear_vintr(struct vcpu_svm *svm)
{
- const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK;
svm_clr_intercept(svm, INTERCEPT_VINTR);
/* Drop int_ctl fields related to VINTR injection. */
- svm->vmcb->control.int_ctl &= mask;
+ svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
if (is_guest_mode(&svm->vcpu)) {
- svm->vmcb01.ptr->control.int_ctl &= mask;
+ svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
(svm->nested.ctl.int_ctl & V_TPR_MASK));
- svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask;
+
+ svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
+ V_IRQ_INJECTION_BITS_MASK;
}
vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
@@ -2147,11 +2151,12 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
ret = kvm_skip_emulated_instruction(vcpu);
if (vmload) {
- nested_svm_vmloadsave(vmcb12, svm->vmcb);
+ svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
svm->sysenter_eip_hi = 0;
svm->sysenter_esp_hi = 0;
- } else
- nested_svm_vmloadsave(svm->vmcb, vmcb12);
+ } else {
+ svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
+ }
kvm_vcpu_unmap(vcpu, &map, true);
@@ -4344,8 +4349,8 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
- svm_copy_vmrun_state(&svm->vmcb01.ptr->save,
- map_save.hva + 0x400);
+ svm_copy_vmrun_state(map_save.hva + 0x400,
+ &svm->vmcb01.ptr->save);
kvm_vcpu_unmap(vcpu, &map_save, true);
}
@@ -4393,8 +4398,8 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
&map_save) == -EINVAL)
return 1;
- svm_copy_vmrun_state(map_save.hva + 0x400,
- &svm->vmcb01.ptr->save);
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save,
+ map_save.hva + 0x400);
kvm_vcpu_unmap(vcpu, &map_save, true);
}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 7e2090752d8f..bd0fe94c2920 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -464,9 +464,9 @@ void svm_leave_nested(struct vcpu_svm *svm);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
int nested_svm_vmrun(struct kvm_vcpu *vcpu);
-void svm_copy_vmrun_state(struct vmcb_save_area *from_save,
- struct vmcb_save_area *to_save);
-void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
+void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
+ struct vmcb_save_area *from_save);
+void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
int nested_svm_vmexit(struct vcpu_svm *svm);
static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index 9b9a55abc29f..c53b8bf8d013 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -89,7 +89,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments(
* as we mark it dirty unconditionally towards end of vcpu
* init phase.
*/
- if (vmcb && vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) &&
+ if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) &&
hve->hv_enlightenments_control.msr_bitmap)
vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
}
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index b484141ea15b..03ebe368333e 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -92,6 +92,21 @@ TRACE_EVENT(kvm_hv_hypercall,
__entry->outgpa)
);
+TRACE_EVENT(kvm_hv_hypercall_done,
+ TP_PROTO(u64 result),
+ TP_ARGS(result),
+
+ TP_STRUCT__entry(
+ __field(__u64, result)
+ ),
+
+ TP_fast_assign(
+ __entry->result = result;
+ ),
+
+ TP_printk("result 0x%llx", __entry->result)
+);
+
/*
* Tracepoint for Xen hypercall.
*/
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1a52134b0c42..b3f77d18eb5a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -330,6 +330,31 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
vcpu_put(vcpu);
}
+#define EPTP_PA_MASK GENMASK_ULL(51, 12)
+
+static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
+{
+ return VALID_PAGE(root_hpa) &&
+ ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
+}
+
+static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
+ gpa_t addr)
+{
+ uint i;
+ struct kvm_mmu_root_info *cached_root;
+
+ WARN_ON_ONCE(!mmu_is_nested(vcpu));
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ cached_root = &vcpu->arch.mmu->prev_roots[i];
+
+ if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
+ eptp))
+ vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
+ }
+}
+
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
@@ -342,10 +367,22 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
vm_exit_reason = EXIT_REASON_PML_FULL;
vmx->nested.pml_full = false;
exit_qualification &= INTR_INFO_UNBLOCK_NMI;
- } else if (fault->error_code & PFERR_RSVD_MASK)
- vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
- else
- vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ } else {
+ if (fault->error_code & PFERR_RSVD_MASK)
+ vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+ else
+ vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+
+ /*
+ * Although the caller (kvm_inject_emulated_page_fault) would
+ * have already synced the faulting address in the shadow EPT
+ * tables for the current EPTP12, we also need to sync it for
+ * any other cached EPTP02s based on the same EP4TA, since the
+ * TLB associates mappings to the EP4TA rather than the full EPTP.
+ */
+ nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
+ fault->address);
+ }
nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
vmcs12->guest_physical_address = fault->address;
@@ -5325,14 +5362,6 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return nested_vmx_succeed(vcpu);
}
-#define EPTP_PA_MASK GENMASK_ULL(51, 12)
-
-static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
-{
- return VALID_PAGE(root_hpa) &&
- ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
-}
-
/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
@@ -5826,7 +5855,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
if (is_nmi(intr_info))
return true;
else if (is_page_fault(intr_info))
- return vcpu->arch.apf.host_apf_flags || !enable_ept;
+ return vcpu->arch.apf.host_apf_flags ||
+ vmx_need_pf_intercept(vcpu);
else if (is_debug(intr_info) &&
vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index db88ed4f2121..17a1cb4b059d 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -522,7 +522,7 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
{
- return vmx->secondary_exec_control &
+ return secondary_exec_controls_get(vmx) &
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a4fd10604f72..e5d5c5ed7dd4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3407,7 +3407,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
break;
case MSR_KVM_ASYNC_PF_ACK:
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
return 1;
if (data & 0x1) {
vcpu->arch.apf.pageready_pending = false;
@@ -3746,7 +3746,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.apf.msr_int_val;
break;
case MSR_KVM_ASYNC_PF_ACK:
- if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
return 1;
msr_info->data = 0;
@@ -4358,8 +4358,17 @@ static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
{
- return kvm_arch_interrupt_allowed(vcpu) &&
- kvm_cpu_accept_dm_intr(vcpu);
+ /*
+ * Do not cause an interrupt window exit if an exception
+ * is pending or an event needs reinjection; userspace
+ * might want to inject the interrupt manually using KVM_SET_REGS
+ * or KVM_SET_SREGS. For that to work, we must be at an
+ * instruction boundary and with no events half-injected.
+ */
+ return (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu) &&
+ !vcpu->arch.exception.pending);
}
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,