Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c  268
1 files changed, 169 insertions, 99 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e50425d0f5f7..978f402006ee 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -22,6 +22,7 @@
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
+#include "cpuid.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -198,16 +199,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
/*
- * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
- * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
- * number.
+ * the low bit of the generation number is always presumed to be zero.
+ * This disables mmio caching during memslot updates. The concept is
+ * similar to a seqcount but instead of retrying the access we just punt
+ * and ignore the cache.
+ *
+ * spte bits 3-11 are used as bits 1-9 of the generation number,
+ * the bits 52-61 are used as bits 10-19 of the generation number.
*/
-#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_LOW_SHIFT 2
#define MMIO_SPTE_GEN_HIGH_SHIFT 52
-#define MMIO_GEN_SHIFT 19
-#define MMIO_GEN_LOW_SHIFT 9
-#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_SHIFT 20
+#define MMIO_GEN_LOW_SHIFT 10
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
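Illustration (not part of this patch): a minimal sketch of how a generation value with the layout described above is folded into an spte. The helper name pack_mmio_spte_generation() is hypothetical; the shifts simply restate the macro comment, with bit 0 of the generation implicitly zero, and mirror the logic behind the get_mmio_spte_generation() helper touched in the next hunk.

/* Hypothetical sketch -- pack generation bits 1-19 into spte bits 3-11 and 52-61. */
static u64 pack_mmio_spte_generation(unsigned int gen)
{
	u64 mask;

	WARN_ON(gen & ~MMIO_GEN_MASK);

	mask  = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
	mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
	return mask;
}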
@@ -235,12 +240,7 @@ static unsigned int get_mmio_spte_generation(u64 spte)
static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
{
- /*
- * Init kvm generation close to MMIO_MAX_GEN to easily test the
- * code of handling generation number wrap-around.
- */
- return (kvm_memslots(kvm)->generation +
- MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+ return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
}
static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
@@ -295,11 +295,6 @@ static bool check_mmio_spte(struct kvm *kvm, u64 spte)
return likely(kvm_gen == spte_gen);
}
-static inline u64 rsvd_bits(int s, int e)
-{
- return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
@@ -595,7 +590,8 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* we always atomicly update it, see the comments in
* spte_has_volatile_bits().
*/
- if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+ if (spte_is_locklessly_modifiable(old_spte) &&
+ !is_writable_pte(new_spte))
ret = true;
if (!shadow_accessed_mask)
@@ -634,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
* kvm mmu, before reclaiming the page, we should
* unmap it from mmu first.
*/
- WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
@@ -1176,20 +1172,18 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
/*
* Write-protect on the specified @sptep, @pt_protect indicates whether
- * spte writ-protection is caused by protecting shadow page table.
- * @flush indicates whether tlb need be flushed.
+ * spte write-protection is caused by protecting shadow page table.
*
- * Note: write protection is difference between drity logging and spte
+ * Note: write protection is difference between dirty logging and spte
* protection:
* - for dirty logging, the spte can be set to writable at anytime if
* its dirty bitmap is properly set.
* - for spte protection, the spte can be writable only after unsync-ing
* shadow page.
*
- * Return true if the spte is dropped.
+ * Return true if tlb need be flushed.
*/
-static bool
-spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
{
u64 spte = *sptep;
@@ -1199,17 +1193,11 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
- if (__drop_large_spte(kvm, sptep)) {
- *flush |= true;
- return true;
- }
-
if (pt_protect)
spte &= ~SPTE_MMU_WRITEABLE;
spte = spte & ~PT_WRITABLE_MASK;
- *flush |= mmu_spte_update(sptep, spte);
- return false;
+ return mmu_spte_update(sptep, spte);
}
static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
@@ -1221,11 +1209,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
- sptep = rmap_get_first(*rmapp, &iter);
- continue;
- }
+ flush |= spte_write_protect(kvm, sptep, pt_protect);
sptep = rmap_get_next(&iter);
}
@@ -1277,7 +1262,8 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1285,7 +1271,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
while ((sptep = rmap_get_first(*rmapp, &iter))) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n",
+ sptep, *sptep, gfn, level);
drop_spte(kvm, sptep);
need_tlb_flush = 1;
@@ -1295,7 +1282,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1309,7 +1297,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!is_shadow_present_pte(*sptep));
- rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
+ sptep, *sptep, gfn, level);
need_flush = 1;
@@ -1343,6 +1332,8 @@ static int kvm_handle_hva_range(struct kvm *kvm,
int (*handler)(struct kvm *kvm,
unsigned long *rmapp,
struct kvm_memory_slot *slot,
+ gfn_t gfn,
+ int level,
unsigned long data))
{
int j;
@@ -1372,6 +1363,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
unsigned long idx, idx_end;
unsigned long *rmapp;
+ gfn_t gfn = gfn_start;
/*
* {idx(page_j) | page_j intersects with
@@ -1382,8 +1374,10 @@ static int kvm_handle_hva_range(struct kvm *kvm,
rmapp = __gfn_to_rmap(gfn_start, j, memslot);
- for (; idx <= idx_end; ++idx)
- ret |= handler(kvm, rmapp++, memslot, data);
+ for (; idx <= idx_end;
+ ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j)))
+ ret |= handler(kvm, rmapp++, memslot,
+ gfn, j, data);
}
}
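A worked example of the new gfn bookkeeping (illustrative numbers only): KVM_HPAGE_GFN_SHIFT(j) is 0, 9 and 18 for the 4K, 2M and 1G levels, so the loop above hands each handler call the first gfn covered by its rmap slot, stepping by 1, 512 or 262144 frames per slot respectively. For instance, at the 2M level:

	/* Sanity illustration, not code added by this diff. */
	BUILD_BUG_ON((1UL << KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL)) != 512);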
@@ -1394,6 +1388,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
unsigned long data,
int (*handler)(struct kvm *kvm, unsigned long *rmapp,
struct kvm_memory_slot *slot,
+ gfn_t gfn, int level,
unsigned long data))
{
return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
@@ -1415,24 +1410,14 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
}
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
{
u64 *sptep;
struct rmap_iterator uninitialized_var(iter);
int young = 0;
- /*
- * In case of absence of EPT Access and Dirty Bits supports,
- * emulate the accessed bit for EPT, by checking if this page has
- * an EPT mapping, and clearing it if it does. On the next access,
- * a new EPT mapping will be established.
- * This has some overhead, but not as much as the cost of swapping
- * out actively used pages or breaking up actively used hugepages.
- */
- if (!shadow_accessed_mask) {
- young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
- goto out;
- }
+ BUG_ON(!shadow_accessed_mask);
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
@@ -1444,14 +1429,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
(unsigned long *)sptep);
}
}
-out:
- /* @data has hva passed to kvm_age_hva(). */
- trace_kvm_age_page(data, slot, young);
+ trace_kvm_age_page(gfn, level, slot, young);
return young;
}
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1489,13 +1473,33 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
+ /*
+ * In case of absence of EPT Access and Dirty Bits supports,
+ * emulate the accessed bit for EPT, by checking if this page has
+ * an EPT mapping, and clearing it if it does. On the next access,
+ * a new EPT mapping will be established.
+ * This has some overhead, but not as much as the cost of swapping
+ * out actively used pages or breaking up actively used hugepages.
+ */
+ if (!shadow_accessed_mask) {
+ /*
+ * We are holding the kvm->mmu_lock, and we are blowing up
+ * shadow PTEs. MMU notifier consumers need to be kept at bay.
+ * This is correct as long as we don't decouple the mmu_lock
+ * protected regions (like invalidate_range_start|end does).
+ */
+ kvm->mmu_notifier_seq++;
+ return kvm_handle_hva_range(kvm, start, end, 0,
+ kvm_unmap_rmapp);
+ }
+
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
}
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
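For context on the mmu_notifier_seq increment above: fault handlers elsewhere in KVM sample the sequence number before resolving a pfn and back off if it moved, so bumping it here keeps them from racing with the unmap. A condensed, hypothetical sketch of that existing pattern (the function sketch_map_gfn() is made up for illustration; it is not code added by this diff):

/* Hypothetical condensation of the existing fault-path retry pattern. */
static int sketch_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	unsigned long mmu_seq = vcpu->kvm->mmu_notifier_seq;

	smp_rmb();

	/* ... translate gfn to pfn outside mmu_lock, may sleep ... */

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
		spin_unlock(&vcpu->kvm->mmu_lock);
		return -EAGAIN;		/* a notifier ran; redo the fault */
	}
	/* ... safe to install the spte ... */
	spin_unlock(&vcpu->kvm->mmu_lock);
	return 0;
}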
@@ -1758,7 +1762,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return 1;
}
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
return 0;
}
@@ -1811,7 +1815,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
if (flush)
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
struct mmu_page_path {
@@ -2457,7 +2461,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ kvm_is_reserved_pfn(pfn));
if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
@@ -2545,7 +2549,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
true, host_writable)) {
if (write_fault)
*emulate = 1;
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (unlikely(is_mmio_spte(*sptep) && emulate))
@@ -2672,6 +2676,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
break;
}
+ drop_large_spte(vcpu, iterator.sptep);
if (!is_shadow_present_pte(*iterator.sptep)) {
u64 base_addr = iterator.addr;
@@ -2732,7 +2737,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
* PT_PAGE_TABLE_LEVEL and there would be no adjustment done
* here.
*/
- if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
!has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
@@ -2801,9 +2806,9 @@ static bool page_fault_can_be_fast(u32 error_code)
}
static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *sptep, u64 spte)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
gfn_t gfn;
WARN_ON(!sp->role.direct);
@@ -2829,6 +2834,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
u32 error_code)
{
struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
bool ret = false;
u64 spte = 0ull;
@@ -2852,7 +2858,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
goto exit;
}
- if (!is_last_spte(spte, level))
+ sp = page_header(__pa(iterator.sptep));
+ if (!is_last_spte(spte, sp->role.level))
goto exit;
/*
@@ -2874,11 +2881,24 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
goto exit;
/*
+ * Do not fix write-permission on the large spte since we only dirty
+ * the first page into the dirty-bitmap in fast_pf_fix_direct_spte()
+ * that means other pages are missed if its slot is dirty-logged.
+ *
+ * Instead, we let the slow page fault path create a normal spte to
+ * fix the access.
+ *
+ * See the comments in kvm_arch_commit_memory_region().
+ */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ goto exit;
+
+ /*
* Currently, fast page fault only works for direct mapping since
* the gfn is not stable for indirect shadow page.
* See Documentation/virtual/kvm/locking.txt to get more detail.
*/
- ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+ ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
exit:
trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
spte, ret);
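The large-spte bail-out added above exists because of how the lockless fixup publishes the write bit: only one gfn gets marked dirty. Roughly what fast_pf_fix_direct_spte() does, as a simplified, hypothetical sketch (sketch_fix_spte() is not real code; error handling omitted):

/* Simplified sketch of the lockless write-bit fixup. */
static bool sketch_fix_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			    u64 *sptep, u64 spte)
{
	gfn_t gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);

	/* Atomically restore the write bit only if the spte did not change. */
	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
		mark_page_dirty(vcpu->kvm, gfn);	/* only this single gfn */

	return true;
}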
@@ -3156,7 +3176,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- vcpu_clear_mmio_info(vcpu, ~0ul);
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -3199,7 +3219,7 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
{
if (exception)
exception->error_code = 0;
- return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
+ return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
}
static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
@@ -3328,7 +3348,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
arch.direct_map = vcpu->arch.mmu.direct_map;
arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
- return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+ return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
}
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -3443,13 +3463,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
context->nx = false;
}
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
-
void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
@@ -3510,11 +3523,23 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
{
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
+ u64 gbpages_bit_rsvd = 0;
+ u64 nonleaf_bit8_rsvd = 0;
context->bad_mt_xwr = 0;
if (!context->nx)
exb_bit_rsvd = rsvd_bits(63, 63);
+ if (!guest_cpuid_has_gbpages(vcpu))
+ gbpages_bit_rsvd = rsvd_bits(7, 7);
+
+ /*
+ * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
+ * leaf entries) on AMD CPUs only.
+ */
+ if (guest_cpuid_is_amd(vcpu))
+ nonleaf_bit8_rsvd = rsvd_bits(8, 8);
+
switch (context->root_level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
@@ -3537,7 +3562,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
case PT32E_ROOT_LEVEL:
context->rsvd_bits_mask[0][2] =
rsvd_bits(maxphyaddr, 63) |
- rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */
+ rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62); /* PDE */
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
@@ -3549,16 +3574,16 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+ nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
+ gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 29);
context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
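A worked example of the masks built above (illustrative numbers, not part of the patch): with maxphyaddr == 40, NX enabled, an Intel vendor and no 1GB-page CPUID bit, the non-leaf PDPTE mask rsvd_bits_mask[0][2] evaluates to rsvd_bits(40, 51) | rsvd_bits(7, 7).

/* rsvd_bits() as it was defined before being removed from this file above. */
static inline u64 rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

/*
 * exb_bit_rsvd == 0 (NX on), nonleaf_bit8_rsvd == 0 (Intel),
 * gbpages_bit_rsvd == rsvd_bits(7, 7)  == 0x0000000000000080,
 * rsvd_bits(40, 51)                    == 0x000fff0000000000,
 * so rsvd_bits_mask[0][2]              == 0x000fff0000000080.
 */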
@@ -3600,20 +3625,27 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
}
}
-static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+void update_permission_bitmask(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu, bool ept)
{
unsigned bit, byte, pfec;
u8 map;
- bool fault, x, w, u, wf, uf, ff, smep;
+ bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0;
- smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+ cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+ cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
pfec = byte << 1;
map = 0;
wf = pfec & PFERR_WRITE_MASK;
uf = pfec & PFERR_USER_MASK;
ff = pfec & PFERR_FETCH_MASK;
+ /*
+ * PFERR_RSVD_MASK bit is set in PFEC if the access is not
+ * subject to SMAP restrictions, and cleared otherwise. The
+ * bit is only meaningful if the SMAP bit is set in CR4.
+ */
+ smapf = !(pfec & PFERR_RSVD_MASK);
for (bit = 0; bit < 8; ++bit) {
x = bit & ACC_EXEC_MASK;
w = bit & ACC_WRITE_MASK;
@@ -3625,12 +3657,33 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
/* Allow supervisor writes if !cr0.wp */
w |= !is_write_protection(vcpu) && !uf;
/* Disallow supervisor fetches of user code if cr4.smep */
- x &= !(smep && u && !uf);
+ x &= !(cr4_smep && u && !uf);
+
+ /*
+ * SMAP:kernel-mode data accesses from user-mode
+ * mappings should fault. A fault is considered
+ * as a SMAP violation if all of the following
+ * conditions are true:
+ * - X86_CR4_SMAP is set in CR4
+ * - A user page is accessed
+ * - Page fault in kernel mode
+ * - if CPL = 3 or X86_EFLAGS_AC is clear
+ *
+ * Here, we cover the first three conditions.
+ * The fourth is computed dynamically in
+ * permission_fault() and is in smapf.
+ *
+ * Also, SMAP does not affect instruction
+ * fetches, add the !ff check here to make it
+ * clearer.
+ */
+ smap = cr4_smap && u && !uf && !ff;
} else
/* Not really needed: no U/S accesses on ept */
u = 1;
- fault = (ff && !x) || (uf && !u) || (wf && !w);
+ fault = (ff && !x) || (uf && !u) || (wf && !w) ||
+ (smapf && smap);
map |= fault << bit;
}
mmu->permissions[byte] = map;
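For reference, the table filled in here is consumed on the fault path by a single lookup. Ignoring the new SMAP index handling, the check is essentially the following (sketch_permission_fault() is an illustrative simplification of the existing permission_fault() helper in mmu.h, not code from this diff):

/* Simplified pre-SMAP form of the permission lookup. */
static bool sketch_permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
				    unsigned pfec)
{
	/* byte selected by the error code, bit by the pte access bits */
	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
}

With SMAP, the caller must first fold the "access not subject to SMAP" condition into the PFERR_RSVD_MASK bit of pfec, exactly as the comment above describes.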
@@ -3924,7 +3977,7 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
if (remote_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
else if (local_flush)
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
@@ -4185,7 +4238,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
vcpu->arch.mmu.invlpg(vcpu, gva);
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -4275,15 +4328,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (*rmapp)
__rmap_write_protect(kvm, rmapp, false);
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- kvm_flush_remote_tlbs(kvm);
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock))
cond_resched_lock(&kvm->mmu_lock);
- }
}
}
- kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
+
+ /*
+ * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+ * which do tlb flush out of mmu-lock should be serialized by
+ * kvm->slots_lock otherwise tlb flush would be missed.
+ */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * We can flush all the TLBs out of the mmu lock without TLB
+ * corruption since we just change the spte from writable to
+ * readonly so that we only need to care the case of changing
+ * spte from present to present (changing the spte from present
+ * to nonpresent will flush all the TLBs immediately), in other
+ * words, the only case we care is mmu_spte_update() where we
+ * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
+ * instead of PT_WRITABLE_MASK, that means it does not depend
+ * on PT_WRITABLE_MASK anymore.
+ */
+ kvm_flush_remote_tlbs(kvm);
}
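A usage sketch of the serialization the lockdep assertion above documents (illustrative, not from this diff): callers are expected to hold kvm->slots_lock around both this write-protect pass and the dirty-log path, so the deferred TLB flush cannot be missed.

	/* Illustrative caller: both flush paths serialize on slots_lock. */
	mutex_lock(&kvm->slots_lock);
	kvm_mmu_slot_remove_write_access(kvm, slot);
	mutex_unlock(&kvm->slots_lock);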
#define BATCH_ZAP_PAGES 10
@@ -4378,7 +4448,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
* The very rare case: if the generation-number is round,
* zap all shadow pages.
*/
- if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
+ if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
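The wraparound test above is purely arithmetic: with MMIO_GEN_SHIFT == 20, the masked generation returns to zero once every 1 << 20 memslot updates, at which point any cached mmio spte may carry a stale generation and the shadow pages are zapped wholesale rather than checked one by one.

	/*
	 * Example (illustrative): memslots generation 0x100000
	 *   0x100000 & MMIO_GEN_MASK == 0  ->  zap all shadow pages
	 */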
@@ -4479,7 +4549,7 @@ int kvm_mmu_module_init(void)
if (!mmu_page_header_cache)
goto nomem;
- if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
+ if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
goto nomem;
register_shrinker(&mmu_shrinker);