summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/hyperv.c91
-rw-r--r--arch/x86/kvm/hyperv.h1
-rw-r--r--arch/x86/kvm/lapic.c12
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h5
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c30
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h4
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c51
-rw-r--r--arch/x86/kvm/svm/svm.c25
-rw-r--r--arch/x86/kvm/vmx/vmx.c2
-rw-r--r--arch/x86/kvm/x86.c115
10 files changed, 228 insertions, 108 deletions
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 58fa8c029867..f98370a39936 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -520,10 +520,10 @@ static u64 get_time_ref_counter(struct kvm *kvm)
u64 tsc;
/*
- * The guest has not set up the TSC page or the clock isn't
- * stable, fall back to get_kvmclock_ns.
+ * Fall back to get_kvmclock_ns() when TSC page hasn't been set up,
+ * is broken, disabled or being updated.
*/
- if (!hv->tsc_ref.tsc_sequence)
+ if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET)
return div_u64(get_kvmclock_ns(kvm), 100);
vcpu = kvm_get_vcpu(kvm, 0);
@@ -1077,6 +1077,21 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
return true;
}
+/*
+ * Don't touch TSC page values if the guest has opted for TSC emulation after
+ * migration. KVM doesn't fully support reenlightenment notifications and TSC
+ * access emulation and Hyper-V is known to expect the values in TSC page to
+ * stay constant before TSC access emulation is disabled from guest side
+ * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC
+ * frequency and guest visible TSC value across migration (and prevent it when
+ * TSC scaling is unsupported).
+ */
+static inline bool tsc_page_update_unsafe(struct kvm_hv *hv)
+{
+ return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) &&
+ hv->hv_tsc_emulation_control;
+}
+
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock)
{
@@ -1087,7 +1102,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
- if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET)
return;
mutex_lock(&hv->hv_lock);
@@ -1101,7 +1117,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
*/
if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
&tsc_seq, sizeof(tsc_seq))))
+ goto out_err;
+
+ if (tsc_seq && tsc_page_update_unsafe(hv)) {
+ if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ goto out_err;
+
+ hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
goto out_unlock;
+ }
/*
* While we're computing and writing the parameters, force the
@@ -1110,15 +1134,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
hv->tsc_ref.tsc_sequence = 0;
if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
- goto out_unlock;
+ goto out_err;
if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
- goto out_unlock;
+ goto out_err;
/* Ensure sequence is zero before writing the rest of the struct. */
smp_wmb();
if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
- goto out_unlock;
+ goto out_err;
/*
* Now switch to the TSC page mechanism by writing the sequence.
@@ -1131,8 +1155,45 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
smp_wmb();
hv->tsc_ref.tsc_sequence = tsc_seq;
- kvm_write_guest(kvm, gfn_to_gpa(gfn),
- &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ goto out_err;
+
+ hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+ goto out_unlock;
+
+out_err:
+ hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+out_unlock:
+ mutex_unlock(&hv->hv_lock);
+}
+
+void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ u64 gfn;
+
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET ||
+ tsc_page_update_unsafe(hv))
+ return;
+
+ mutex_lock(&hv->hv_lock);
+
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ goto out_unlock;
+
+ /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET)
+ hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING;
+
+ gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+
+ hv->tsc_ref.tsc_sequence = 0;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+
out_unlock:
mutex_unlock(&hv->hv_lock);
}
@@ -1193,8 +1254,15 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
}
case HV_X64_MSR_REFERENCE_TSC:
hv->hv_tsc_page = data;
- if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+ if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
+ if (!host)
+ hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED;
+ else
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ } else {
+ hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET;
+ }
break;
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
return kvm_hv_msr_set_crash_data(kvm,
@@ -1229,6 +1297,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
hv->hv_tsc_emulation_control = data;
break;
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ if (data && !host)
+ return 1;
+
hv->hv_tsc_emulation_status = data;
break;
case HV_X64_MSR_TIME_REF_COUNT:
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index e951af1fcb2c..60547d5cb6d7 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -133,6 +133,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock);
+void kvm_hv_invalidate_tsc_page(struct kvm *kvm);
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 45d40bfacb7c..cc369b9ad8f1 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1642,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
}
if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
- kvm_wait_lapic_expire(vcpu);
+ /*
+ * Ensure the guest's timer has truly expired before posting an
+ * interrupt. Open code the relevant checks to avoid querying
+ * lapic_timer_int_injected(), which will be false since the
+ * interrupt isn't yet injected. Waiting until after injecting
+ * is not an option since that won't help a posted interrupt.
+ */
+ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
+ __kvm_wait_lapic_expire(vcpu);
kvm_apic_inject_pending_timer_irqs(apic);
return;
}
@@ -2595,6 +2604,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
+ apic->lapic_timer.expired_tscdeadline = 0;
apic_update_lvtt(apic);
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index ec4fc28b325a..1f6f98c76bdf 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -78,6 +78,11 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
}
+static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+{
+ return sp->role.smm ? 1 : 0;
+}
+
static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
{
/*
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index e5f148106e20..b3ed302c1a35 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -21,6 +21,21 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
}
/*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_restart(struct tdp_iter *iter)
+{
+ iter->yielded_gfn = iter->next_last_level_gfn;
+ iter->level = iter->root_level;
+
+ iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ iter->valid = true;
+}
+
+/*
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
* rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
@@ -31,16 +46,12 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
iter->next_last_level_gfn = next_last_level_gfn;
- iter->yielded_gfn = iter->next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
- iter->level = root_level;
- iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
-
- iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
- tdp_iter_refresh_sptep(iter);
+ iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
+ iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
- iter->valid = true;
+ tdp_iter_restart(iter);
}
/*
@@ -159,8 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter)
iter->valid = false;
}
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
-{
- return iter->pt_path[iter->root_level - 1];
-}
-
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 4cc177d75c4a..b1748b988d3a 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -36,6 +36,8 @@ struct tdp_iter {
int min_level;
/* The iterator's current level within the paging structure */
int level;
+ /* The address space ID, i.e. SMM vs. regular. */
+ int as_id;
/* A snapshot of the value at sptep */
u64 old_spte;
/*
@@ -62,6 +64,6 @@ tdp_ptep_t spte_to_child_pt(u64 pte, int level);
void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
-tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
+void tdp_iter_restart(struct tdp_iter *iter);
#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c926c6b899a1..462b1f71c77f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -203,11 +203,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
-static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
-{
- return sp->role.smm ? 1 : 0;
-}
-
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
@@ -301,11 +296,16 @@ static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
*
* Given a page table that has been removed from the TDP paging structure,
* iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
*/
-static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
bool shared)
{
- struct kvm_mmu_page *sp = sptep_to_sp(pt);
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
u64 old_child_spte;
@@ -318,7 +318,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
tdp_mmu_unlink_page(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- sptep = pt + i;
+ sptep = rcu_dereference(pt) + i;
gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
if (shared) {
@@ -337,7 +337,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, u64 *pt,
cpu_relax();
}
} else {
+ /*
+ * If the SPTE is not MMU-present, there is no backing
+ * page associated with the SPTE and so no side effects
+ * that need to be recorded, and exclusive ownership of
+ * mmu_lock ensures the SPTE can't be made present.
+ * Note, zapping MMIO SPTEs is also unnecessary as they
+ * are guarded by the memslots generation, not by being
+ * unreachable.
+ */
old_child_spte = READ_ONCE(*sptep);
+ if (!is_shadow_present_pte(old_child_spte))
+ continue;
/*
* Marking the SPTE as a removed SPTE is not
@@ -481,10 +492,6 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
- u64 *root_pt = tdp_iter_root_pt(iter);
- struct kvm_mmu_page *root = sptep_to_sp(root_pt);
- int as_id = kvm_mmu_page_as_id(root);
-
lockdep_assert_held_read(&kvm->mmu_lock);
/*
@@ -498,8 +505,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
new_spte) != iter->old_spte)
return false;
- handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
- iter->level, true);
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
return true;
}
@@ -527,7 +534,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* here since the SPTE is going from non-present
* to non-present.
*/
- WRITE_ONCE(*iter->sptep, 0);
+ WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
return true;
}
@@ -553,10 +560,6 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte, bool record_acc_track,
bool record_dirty_log)
{
- tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
- struct kvm_mmu_page *root = sptep_to_sp(root_pt);
- int as_id = kvm_mmu_page_as_id(root);
-
lockdep_assert_held_write(&kvm->mmu_lock);
/*
@@ -570,13 +573,13 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
- __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
- iter->level, false);
+ __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, false);
if (record_acc_track)
handle_changed_spte_acc_track(iter->old_spte, new_spte,
iter->level);
if (record_dirty_log)
- handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
+ handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
iter->old_spte, new_spte,
iter->level);
}
@@ -648,9 +651,7 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
WARN_ON(iter->gfn > iter->next_last_level_gfn);
- tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
- iter->root_level, iter->min_level,
- iter->next_last_level_gfn);
+ tdp_iter_restart(iter);
return true;
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index baee91c1e936..58a45bb139f8 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -115,13 +115,6 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_INVALID, .always = false },
};
-/* enable NPT for AMD64 and X86 with PAE */
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-bool npt_enabled = true;
-#else
-bool npt_enabled;
-#endif
-
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* pause_filter_count: On processors that support Pause filtering(indicated
@@ -170,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444);
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);
-/* allow nested paging (virtualized MMU) for all guests */
-static int npt = true;
-module_param(npt, int, S_IRUGO);
+/*
+ * Use nested page tables by default. Note, NPT may get forced off by
+ * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
+ */
+bool npt_enabled = true;
+module_param_named(npt, npt_enabled, bool, 0444);
/* allow nested virtualization in KVM/SVM */
static int nested = true;
@@ -988,10 +984,15 @@ static __init int svm_hardware_setup(void)
goto err;
}
- if (!boot_cpu_has(X86_FEATURE_NPT))
+ /*
+ * KVM's MMU doesn't support using 2-level paging for itself, and thus
+ * NPT isn't supported if the host is using 2-level paging since host
+ * CR4 is unchanged on VMRUN.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
npt_enabled = false;
- if (npt_enabled && !npt)
+ if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 50810d471462..32cf8287d4a7 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6580,8 +6580,8 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
int i, nr_msrs;
struct perf_guest_switch_msr *msrs;
+ /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
msrs = perf_guest_get_msrs(&nr_msrs);
-
if (!msrs)
return;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2a20ce60152e..fe806e894212 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1526,35 +1526,44 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
{
+ struct kvm_x86_msr_filter *msr_filter;
+ struct msr_bitmap_range *ranges;
struct kvm *kvm = vcpu->kvm;
- struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
- u32 count = kvm->arch.msr_filter.count;
- u32 i;
- bool r = kvm->arch.msr_filter.default_allow;
+ bool allowed;
int idx;
+ u32 i;
- /* MSR filtering not set up or x2APIC enabled, allow everything */
- if (!count || (index >= 0x800 && index <= 0x8ff))
+ /* x2APIC MSRs do not support filtering. */
+ if (index >= 0x800 && index <= 0x8ff)
return true;
- /* Prevent collision with set_msr_filter */
idx = srcu_read_lock(&kvm->srcu);
- for (i = 0; i < count; i++) {
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
+ if (!msr_filter) {
+ allowed = true;
+ goto out;
+ }
+
+ allowed = msr_filter->default_allow;
+ ranges = msr_filter->ranges;
+
+ for (i = 0; i < msr_filter->count; i++) {
u32 start = ranges[i].base;
u32 end = start + ranges[i].nmsrs;
u32 flags = ranges[i].flags;
unsigned long *bitmap = ranges[i].bitmap;
if ((index >= start) && (index < end) && (flags & type)) {
- r = !!test_bit(index - start, bitmap);
+ allowed = !!test_bit(index - start, bitmap);
break;
}
}
+out:
srcu_read_unlock(&kvm->srcu, idx);
- return r;
+ return allowed;
}
EXPORT_SYMBOL_GPL(kvm_msr_allowed);
@@ -2551,6 +2560,8 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
struct kvm_vcpu *vcpu;
struct kvm_arch *ka = &kvm->arch;
+ kvm_hv_invalidate_tsc_page(kvm);
+
spin_lock(&ka->pvclock_gtod_sync_lock);
kvm_make_mclock_inprogress_request(kvm);
/* no guest entries from this point */
@@ -5352,25 +5363,34 @@ split_irqchip_unlock:
return r;
}
-static void kvm_clear_msr_filter(struct kvm *kvm)
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
+ if (!msr_filter)
+ return NULL;
+
+ msr_filter->default_allow = default_allow;
+ return msr_filter;
+}
+
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
{
u32 i;
- u32 count = kvm->arch.msr_filter.count;
- struct msr_bitmap_range ranges[16];
- mutex_lock(&kvm->lock);
- kvm->arch.msr_filter.count = 0;
- memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0]));
- mutex_unlock(&kvm->lock);
- synchronize_srcu(&kvm->srcu);
+ if (!msr_filter)
+ return;
+
+ for (i = 0; i < msr_filter->count; i++)
+ kfree(msr_filter->ranges[i].bitmap);
- for (i = 0; i < count; i++)
- kfree(ranges[i].bitmap);
+ kfree(msr_filter);
}
-static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range)
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
+ struct kvm_msr_filter_range *user_range)
{
- struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
struct msr_bitmap_range range;
unsigned long *bitmap = NULL;
size_t bitmap_size;
@@ -5404,11 +5424,9 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user
goto err;
}
- /* Everything ok, add this range identifier to our global pool */
- ranges[kvm->arch.msr_filter.count] = range;
- /* Make sure we filled the array before we tell anyone to walk it */
- smp_wmb();
- kvm->arch.msr_filter.count++;
+ /* Everything ok, add this range identifier. */
+ msr_filter->ranges[msr_filter->count] = range;
+ msr_filter->count++;
return 0;
err:
@@ -5419,10 +5437,11 @@ err:
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
{
struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
struct kvm_msr_filter filter;
bool default_allow;
- int r = 0;
bool empty = true;
+ int r = 0;
u32 i;
if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
@@ -5435,25 +5454,32 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
if (empty && !default_allow)
return -EINVAL;
- kvm_clear_msr_filter(kvm);
-
- kvm->arch.msr_filter.default_allow = default_allow;
+ new_filter = kvm_alloc_msr_filter(default_allow);
+ if (!new_filter)
+ return -ENOMEM;
- /*
- * Protect from concurrent calls to this function that could trigger
- * a TOCTOU violation on kvm->arch.msr_filter.count.
- */
- mutex_lock(&kvm->lock);
for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
- r = kvm_add_msr_filter(kvm, &filter.ranges[i]);
- if (r)
- break;
+ r = kvm_add_msr_filter(new_filter, &filter.ranges[i]);
+ if (r) {
+ kvm_free_msr_filter(new_filter);
+ return r;
+ }
}
+ mutex_lock(&kvm->lock);
+
+ /* The per-VM filter is protected by kvm->lock... */
+ old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
+
+ rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
+ synchronize_srcu(&kvm->srcu);
+
+ kvm_free_msr_filter(old_filter);
+
kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
mutex_unlock(&kvm->lock);
- return r;
+ return 0;
}
long kvm_arch_vm_ioctl(struct file *filp,
@@ -6603,7 +6629,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
wbinvd_ipi, NULL, 1);
put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
@@ -10601,7 +10627,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
return (void __user *)hva;
} else {
if (!slot || !slot->npages)
- return 0;
+ return NULL;
old_npages = slot->npages;
hva = slot->userspace_addr;
@@ -10634,8 +10660,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
- u32 i;
-
if (current->mm == kvm->mm) {
/*
* Free memory regions allocated on behalf of userspace,
@@ -10651,8 +10675,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
mutex_unlock(&kvm->slots_lock);
}
static_call_cond(kvm_x86_vm_destroy)(kvm);
- for (i = 0; i < kvm->arch.msr_filter.count; i++)
- kfree(kvm->arch.msr_filter.ranges[i].bitmap);
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);