author    Oliver Upton <oliver.upton@linux.dev>  2023-06-15 16:02:11 +0300
committer Oliver Upton <oliver.upton@linux.dev>  2023-06-15 16:02:11 +0300
commit    83510396c0765cc15454eaf445fb98bad773634e (patch)
tree      fe4dc2e04b2ce203026123f48147323efa1df906 /arch/arm64/kvm
parent    44c026a73be8038f03dbdeef028b642880cf1511 (diff)
parent    14c3555f055dd0819381148bf5b569cc5ba9ddfb (diff)
Merge branch kvm-arm64/eager-page-splitting into kvmarm/next
* kvm-arm64/eager-page-splitting:
  : Eager Page Splitting, courtesy of Ricardo Koller.
  :
  : Dirty logging performance is dominated by the cost of splitting
  : hugepages to PTE granularity. On systems that mere mortals can get
  : their hands on, each fault incurs the cost of a full break-before-make
  : pattern, wherein the broadcast invalidation and ensuing serialization
  : significantly increases fault latency.
  :
  : The goal of eager page splitting is to move the cost of hugepage
  : splitting out of the stage-2 fault path and instead into the ioctls
  : responsible for managing the dirty log:
  :
  :  - If manual protection is enabled for the VM, hugepage splitting
  :    happens in the KVM_CLEAR_DIRTY_LOG ioctl. This is desirable as it
  :    provides userspace granular control over hugepage splitting.
  :
  :  - Otherwise, if userspace relies on the legacy dirty log behavior
  :    (clear on collection), hugepage splitting is done at the moment
  :    dirty logging is enabled for a particular memslot.
  :
  : Support for eager page splitting requires explicit opt-in from
  : userspace, which is realized through the
  : KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE capability.

  arm64: kvm: avoid overflow in integer division
  KVM: arm64: Use local TLBI on permission relaxation
  KVM: arm64: Split huge pages during KVM_CLEAR_DIRTY_LOG
  KVM: arm64: Open-code kvm_mmu_write_protect_pt_masked()
  KVM: arm64: Split huge pages when dirty logging is enabled
  KVM: arm64: Add kvm_uninit_stage2_mmu()
  KVM: arm64: Refactor kvm_arch_commit_memory_region()
  KVM: arm64: Add kvm_pgtable_stage2_split()
  KVM: arm64: Add KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
  KVM: arm64: Export kvm_are_all_memslots_empty()
  KVM: arm64: Add helper for creating unlinked stage2 subtrees
  KVM: arm64: Add KVM_PGTABLE_WALK flags for skipping CMOs and BBM TLBIs
  KVM: arm64: Rename free_removed to free_unlinked

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
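As a point of reference, the opt-in described above could look roughly like the
following minimal userspace sketch. It is illustrative only: the vm_fd handling,
helper name and error reporting are assumptions, while the capability constants,
KVM_CHECK_EXTENSION and KVM_ENABLE_CAP are the interface added by this series.
The chunk size must be one of the block sizes advertised by
KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES, and it must be set before any memslots exist.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Sketch: enable eager page splitting with a 2MiB chunk size on a VM
 * file descriptor that has no memory slots created yet.
 */
static int enable_eager_split(int vm_fd)
{
	uint64_t chunk = 2UL * 1024 * 1024;	/* 2MiB */
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE,
		.args = { chunk },
	};
	int supported;

	/* Bitmap of block sizes usable as the split chunk size */
	supported = ioctl(vm_fd, KVM_CHECK_EXTENSION,
			  KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES);
	if (supported <= 0 || !(supported & chunk)) {
		fprintf(stderr, "2MiB split chunks not supported\n");
		return -1;
	}

	/* Must be done before any memory slots are created */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

Passing a chunk size of 0 (the default) leaves eager page splitting disabled,
in which case hugepages are still broken lazily in the stage-2 fault path.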
Diffstat (limited to 'arch/arm64/kvm')
-rw-r--r--  arch/arm64/kvm/arm.c                   |  28
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c     |  10
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mem_protect.c  |   6
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/tlb.c          |  52
-rw-r--r--  arch/arm64/kvm/hyp/pgtable.c           | 201
-rw-r--r--  arch/arm64/kvm/hyp/vhe/tlb.c           |  32
-rw-r--r--  arch/arm64/kvm/mmu.c                   | 207
7 files changed, 484 insertions, 52 deletions
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 14391826241c..c605626801c4 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -65,6 +65,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
int r;
+ u64 new_cap;
if (cap->flags)
return -EINVAL;
@@ -89,6 +90,24 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
break;
+ case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
+ new_cap = cap->args[0];
+
+ mutex_lock(&kvm->slots_lock);
+ /*
+ * To keep things simple, allow changing the chunk
+ * size only when no memory slots have been created.
+ */
+ if (!kvm_are_all_memslots_empty(kvm)) {
+ r = -EINVAL;
+ } else if (new_cap && !kvm_is_block_size_supported(new_cap)) {
+ r = -EINVAL;
+ } else {
+ r = 0;
+ kvm->arch.mmu.split_page_chunk_size = new_cap;
+ }
+ mutex_unlock(&kvm->slots_lock);
+ break;
default:
r = -EINVAL;
break;
@@ -302,6 +321,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_PTRAUTH_GENERIC:
r = system_has_full_ptr_auth();
break;
+ case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
+ if (kvm)
+ r = kvm->arch.mmu.split_page_chunk_size;
+ else
+ r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+ break;
+ case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
+ r = kvm_supported_block_sizes();
+ break;
default:
r = 0;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 728e01d4536b..c6bf1e49ca93 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -125,6 +125,15 @@ static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt)
__kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
}
+static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+ DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2);
+ DECLARE_REG(int, level, host_ctxt, 3);
+
+ __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level);
+}
+
static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
@@ -315,6 +324,7 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh),
HANDLE_FUNC(__kvm_tlb_flush_vmid),
HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 2e9ec4a2a4a3..d35e75b13ffe 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -91,9 +91,9 @@ static void host_s2_put_page(void *addr)
hyp_put_page(&host_s2_pool, addr);
}
-static void host_s2_free_removed_table(void *addr, u32 level)
+static void host_s2_free_unlinked_table(void *addr, u32 level)
{
- kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level);
+ kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level);
}
static int prepare_s2_pool(void *pgt_pool_base)
@@ -110,7 +110,7 @@ static int prepare_s2_pool(void *pgt_pool_base)
host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) {
.zalloc_pages_exact = host_s2_zalloc_pages_exact,
.zalloc_page = host_s2_zalloc_page,
- .free_removed_table = host_s2_free_removed_table,
+ .free_unlinked_table = host_s2_free_unlinked_table,
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.page_count = hyp_page_count,
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 978179133f4b..b9991bbd8e3f 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -130,6 +130,58 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
__tlb_switch_to_host(&cxt);
}
+void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa, int level)
+{
+ struct tlb_inv_context cxt;
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt, true);
+
+ /*
+ * We could do so much better if we had the VA as well.
+ * Instead, we invalidate Stage-2 for this IPA, and the
+ * whole of Stage-1. Weep...
+ */
+ ipa >>= 12;
+ __tlbi_level(ipas2e1, ipa, level);
+
+ /*
+ * We have to ensure completion of the invalidation at Stage-2,
+ * since a table walk on another CPU could refill a TLB with a
+ * complete (S1 + S2) walk based on the old Stage-2 mapping if
+ * the Stage-1 invalidation happened first.
+ */
+ dsb(nsh);
+ __tlbi(vmalle1);
+ dsb(nsh);
+ isb();
+
+ /*
+ * If the host is running at EL1 and we have a VPIPT I-cache,
+ * then we must perform I-cache maintenance at EL2 in order for
+ * it to have an effect on the guest. Since the guest cannot hit
+ * I-cache lines allocated with a different VMID, we don't need
+ * to worry about junk out of guest reset (we nuke the I-cache on
+ * VMID rollover), but we do need to be careful when remapping
+ * executable pages for the same guest. This can happen when KSM
+ * takes a CoW fault on an executable page, copies the page into
+ * a page that was previously mapped in the guest and then needs
+ * to invalidate the guest view of the I-cache for that page
+ * from EL1. To solve this, we invalidate the entire I-cache when
+ * unmapping a page from a guest if we have a VPIPT I-cache but
+ * the host is running at EL1. As above, we could do better if
+ * we had the VA.
+ *
+ * The moral of this story is: if you have a VPIPT I-cache, then
+ * you should be running with VHE enabled.
+ */
+ if (icache_is_vpipt())
+ icache_inval_all_pou();
+
+ __tlb_switch_to_host(&cxt);
+}
+
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 5282cb9ca4cf..8acab89080af 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -63,6 +63,16 @@ struct kvm_pgtable_walk_data {
const u64 end;
};
+static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
+}
+
+static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
+}
+
static bool kvm_phys_is_valid(u64 phys)
{
return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
@@ -743,14 +753,17 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
return false;
- /*
- * Perform the appropriate TLB invalidation based on the evicted pte
- * value (if any).
- */
- if (kvm_pte_table(ctx->old, ctx->level))
- kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
- else if (kvm_pte_valid(ctx->old))
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
+ if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
+ /*
+ * Perform the appropriate TLB invalidation based on the
+ * evicted pte value (if any).
+ */
+ if (kvm_pte_table(ctx->old, ctx->level))
+ kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
+ else if (kvm_pte_valid(ctx->old))
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
+ ctx->addr, ctx->level);
+ }
if (stage2_pte_is_counted(ctx->old))
mm_ops->put_page(ctx->ptep);
@@ -857,11 +870,13 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
return -EAGAIN;
/* Perform CMOs before installation of the guest stage-2 PTE */
- if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
+ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
+ stage2_pte_cacheable(pgt, new))
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
- granule);
+ granule);
- if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
+ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
+ stage2_pte_executable(new))
mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
stage2_make_pte(ctx, new);
@@ -883,7 +898,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
if (ret)
return ret;
- mm_ops->free_removed_table(childp, ctx->level);
+ mm_ops->free_unlinked_table(childp, ctx->level);
return 0;
}
@@ -928,7 +943,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
* The TABLE_PRE callback runs for table entries on the way down, looking
* for table entries which we could conceivably replace with a block entry
* for this mapping. If it finds one it replaces the entry and calls
- * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table.
+ * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
*
* Otherwise, the LEAF callback performs the mapping at the existing leaves
* instead.
@@ -1197,7 +1212,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
KVM_PGTABLE_WALK_HANDLE_FAULT |
KVM_PGTABLE_WALK_SHARED);
if (!ret)
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
return ret;
}
@@ -1230,6 +1245,162 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
return kvm_pgtable_walk(pgt, addr, size, &walker);
}
+kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
+ u64 phys, u32 level,
+ enum kvm_pgtable_prot prot,
+ void *mc, bool force_pte)
+{
+ struct stage2_map_data map_data = {
+ .phys = phys,
+ .mmu = pgt->mmu,
+ .memcache = mc,
+ .force_pte = force_pte,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_map_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
+ KVM_PGTABLE_WALK_SKIP_CMO,
+ .arg = &map_data,
+ };
+ /*
+ * The input address (.addr) is irrelevant for walking an
+ * unlinked table. Construct an ambiguous IA range to map
+ * kvm_granule_size(level) worth of memory.
+ */
+ struct kvm_pgtable_walk_data data = {
+ .walker = &walker,
+ .addr = 0,
+ .end = kvm_granule_size(level),
+ };
+ struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
+ kvm_pte_t *pgtable;
+ int ret;
+
+ if (!IS_ALIGNED(phys, kvm_granule_size(level)))
+ return ERR_PTR(-EINVAL);
+
+ ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ pgtable = mm_ops->zalloc_page(mc);
+ if (!pgtable)
+ return ERR_PTR(-ENOMEM);
+
+ ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
+ level + 1);
+ if (ret) {
+ kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level);
+ mm_ops->put_page(pgtable);
+ return ERR_PTR(ret);
+ }
+
+ return pgtable;
+}
+
+/*
+ * Get the number of page-tables needed to replace a block with a
+ * fully populated tree up to the PTE entries. Note that @level is
+ * interpreted as in "level @level entry".
+ */
+static int stage2_block_get_nr_page_tables(u32 level)
+{
+ switch (level) {
+ case 1:
+ return PTRS_PER_PTE + 1;
+ case 2:
+ return 1;
+ case 3:
+ return 0;
+ default:
+ WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
+ level >= KVM_PGTABLE_MAX_LEVELS);
+ return -EINVAL;
+ };
+}
+
+static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ struct kvm_mmu_memory_cache *mc = ctx->arg;
+ struct kvm_s2_mmu *mmu;
+ kvm_pte_t pte = ctx->old, new, *childp;
+ enum kvm_pgtable_prot prot;
+ u32 level = ctx->level;
+ bool force_pte;
+ int nr_pages;
+ u64 phys;
+
+ /* No huge-pages exist at the last level */
+ if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+ return 0;
+
+ /* We only split valid block mappings */
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ nr_pages = stage2_block_get_nr_page_tables(level);
+ if (nr_pages < 0)
+ return nr_pages;
+
+ if (mc->nobjs >= nr_pages) {
+ /* Build a tree mapped down to the PTE granularity. */
+ force_pte = true;
+ } else {
+ /*
+ * Don't force PTEs, so create_unlinked() below does
+ * not populate the tree up to the PTE level. The
+ * consequence is that the call will require a single
+ * page of level 2 entries at level 1, or a single
+ * page of PTEs at level 2. If we are at level 1, the
+ * PTEs will be created recursively.
+ */
+ force_pte = false;
+ nr_pages = 1;
+ }
+
+ if (mc->nobjs < nr_pages)
+ return -ENOMEM;
+
+ mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
+ phys = kvm_pte_to_phys(pte);
+ prot = kvm_pgtable_stage2_pte_prot(pte);
+
+ childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
+ level, prot, mc, force_pte);
+ if (IS_ERR(childp))
+ return PTR_ERR(childp);
+
+ if (!stage2_try_break_pte(ctx, mmu)) {
+ kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
+ mm_ops->put_page(childp);
+ return -EAGAIN;
+ }
+
+ /*
+ * Note, the contents of the page table are guaranteed to be made
+ * visible before the new PTE is assigned because stage2_make_pte()
+ * writes the PTE using smp_store_release().
+ */
+ new = kvm_init_table_pte(childp, mm_ops);
+ stage2_make_pte(ctx, new);
+ dsb(ishst);
+ return 0;
+}
+
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_mmu_memory_cache *mc)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_split_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = mc,
+ };
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
struct kvm_pgtable_mm_ops *mm_ops,
@@ -1299,7 +1470,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
pgt->pgd = NULL;
}
-void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
+void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
{
kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
struct kvm_pgtable_walker walker = {
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index 24cef9b87f9e..e69da550cdc5 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -111,6 +111,38 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
__tlb_switch_to_host(&cxt);
}
+void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa, int level)
+{
+ struct tlb_inv_context cxt;
+
+ dsb(nshst);
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt);
+
+ /*
+ * We could do so much better if we had the VA as well.
+ * Instead, we invalidate Stage-2 for this IPA, and the
+ * whole of Stage-1. Weep...
+ */
+ ipa >>= 12;
+ __tlbi_level(ipas2e1, ipa, level);
+
+ /*
+ * We have to ensure completion of the invalidation at Stage-2,
+ * since a table walk on another CPU could refill a TLB with a
+ * complete (S1 + S2) walk based on the old Stage-2 mapping if
+ * the Stage-1 invalidation happened first.
+ */
+ dsb(nsh);
+ __tlbi(vmalle1);
+ dsb(nsh);
+ isb();
+
+ __tlb_switch_to_host(&cxt);
+}
+
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3b9d4d24c361..6db9ef288ec3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -31,14 +31,21 @@ static phys_addr_t __ro_after_init hyp_idmap_vector;
static unsigned long __ro_after_init io_map_base;
-static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
+ phys_addr_t size)
{
- phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
return (boundary - 1 < end - 1) ? boundary : end;
}
+static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
+
+ return __stage2_range_addr_end(addr, end, size);
+}
+
/*
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
* we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
@@ -75,6 +82,79 @@ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
#define stage2_apply_range_resched(mmu, addr, end, fn) \
stage2_apply_range(mmu, addr, end, fn, true)
+/*
+ * Get the maximum number of page-tables pages needed to split a range
+ * of blocks into PAGE_SIZE PTEs. It assumes the range is already
+ * mapped at level 2, or at level 1 if allowed.
+ */
+static int kvm_mmu_split_nr_page_tables(u64 range)
+{
+ int n = 0;
+
+ if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
+ n += DIV_ROUND_UP(range, PUD_SIZE);
+ n += DIV_ROUND_UP(range, PMD_SIZE);
+ return n;
+}
+
+static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
+{
+ struct kvm_mmu_memory_cache *cache;
+ u64 chunk_size, min;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+ return true;
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ min = kvm_mmu_split_nr_page_tables(chunk_size);
+ cache = &kvm->arch.mmu.split_page_cache;
+ return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
+ phys_addr_t end)
+{
+ struct kvm_mmu_memory_cache *cache;
+ struct kvm_pgtable *pgt;
+ int ret, cache_capacity;
+ u64 next, chunk_size;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);
+
+ if (chunk_size == 0)
+ return 0;
+
+ cache = &kvm->arch.mmu.split_page_cache;
+
+ do {
+ if (need_split_memcache_topup_or_resched(kvm)) {
+ write_unlock(&kvm->mmu_lock);
+ cond_resched();
+ /* Eager page splitting is best-effort. */
+ ret = __kvm_mmu_topup_memory_cache(cache,
+ cache_capacity,
+ cache_capacity);
+ write_lock(&kvm->mmu_lock);
+ if (ret)
+ break;
+ }
+
+ pgt = kvm->arch.mmu.pgt;
+ if (!pgt)
+ return -EINVAL;
+
+ next = __stage2_range_addr_end(addr, end, chunk_size);
+ ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
+ if (ret)
+ break;
+ } while (addr = next, addr != end);
+
+ return ret;
+}
+
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -131,21 +211,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
-static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
struct page *page = container_of(head, struct page, rcu_head);
void *pgtable = page_to_virt(page);
u32 level = page_private(page);
- kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
+ kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
}
-static void stage2_free_removed_table(void *addr, u32 level)
+static void stage2_free_unlinked_table(void *addr, u32 level)
{
struct page *page = virt_to_page(addr);
set_page_private(page, (unsigned long)level);
- call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+ call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}
static void kvm_host_get_page(void *addr)
@@ -701,7 +781,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.zalloc_page = stage2_memcache_zalloc_page,
.zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
.free_pages_exact = kvm_s2_free_pages_exact,
- .free_removed_table = stage2_free_removed_table,
+ .free_unlinked_table = stage2_free_unlinked_table,
.get_page = kvm_host_get_page,
.put_page = kvm_s2_put_page,
.page_count = kvm_host_page_count,
@@ -775,6 +855,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
+ /* The eager page splitting is disabled by default */
+ mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+ mmu->split_page_cache.gfp_zero = __GFP_ZERO;
+
mmu->pgt = pgt;
mmu->pgd_phys = __pa(pgt->pgd);
return 0;
@@ -786,6 +870,12 @@ out_free_pgtable:
return err;
}
+void kvm_uninit_stage2_mmu(struct kvm *kvm)
+{
+ kvm_free_stage2_pgd(&kvm->arch.mmu);
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
+}
+
static void stage2_unmap_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
@@ -989,39 +1079,66 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
}
/**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
+ * pages for memory slot
* @kvm: The KVM pointer
- * @slot: The memory slot associated with mask
- * @gfn_offset: The gfn offset in memory slot
- * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
- * slot to be write protected
+ * @slot: The memory slot to split
*
- * Walks bits set in mask write protects the associated pte's. Caller must
- * acquire kvm_mmu_lock.
+ * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
*/
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t gfn_offset, unsigned long mask)
+static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
- phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
- phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
- phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ phys_addr_t start, end;
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ lockdep_assert_held(&kvm->slots_lock);
+
+ slots = kvm_memslots(kvm);
+ memslot = id_to_memslot(slots, slot);
+
+ start = memslot->base_gfn << PAGE_SHIFT;
+ end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+ write_lock(&kvm->mmu_lock);
+ kvm_mmu_split_huge_pages(kvm, start, end);
+ write_unlock(&kvm->mmu_lock);
}
/*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
+ * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
+ * @kvm: The KVM pointer
+ * @slot: The memory slot associated with mask
+ * @gfn_offset: The gfn offset in memory slot
+ * @mask: The mask of pages at offset 'gfn_offset' in this memory
+ * slot to enable dirty logging on
*
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
+ * Writes protect selected pages to enable dirty logging, and then
+ * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
*/
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+ phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+ phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
+ phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ stage2_wp_range(&kvm->arch.mmu, start, end);
+
+ /*
+ * Eager-splitting is done when manual-protect is set. We
+ * also check for initially-all-set because we can avoid
+ * eager-splitting if initially-all-set is false.
+ * Initially-all-set equal false implies that huge-pages were
+ * already split when enabling dirty logging: no need to do it
+ * again.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ kvm_mmu_split_huge_pages(kvm, start, end);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -1790,20 +1907,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+
/*
* At this point memslot has been committed and there is an
* allocated dirty_bitmap[], dirty pages will be tracked while the
* memory slot is write protected.
*/
- if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (log_dirty_pages) {
+
+ if (change == KVM_MR_DELETE)
+ return;
+
/*
- * If we're with initial-all-set, we don't need to write
- * protect any pages because they're all reported as dirty.
- * Huge pages and normal pages will be write protect gradually.
+ * Huge and normal pages are write-protected and split
+ * on either of these two cases:
+ *
+ * 1. with initial-all-set: gradually with CLEAR ioctls,
*/
- if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
- kvm_mmu_wp_memory_region(kvm, new->id);
- }
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
+ /*
+ * or
+ * 2. without initial-all-set: all in one shot when
+ * enabling dirty logging.
+ */
+ kvm_mmu_wp_memory_region(kvm, new->id);
+ kvm_mmu_split_memory_region(kvm, new->id);
+ } else {
+ /*
+ * Free any leftovers from the eager page splitting cache. Do
+ * this when deleting, moving, disabling dirty logging, or
+ * creating the memslot (a nop). Doing it for deletes makes
+ * sure we don't leak memory, and there's no need to keep the
+ * cache around for any of the other cases.
+ */
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}
}
@@ -1877,7 +2016,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- kvm_free_stage2_pgd(&kvm->arch.mmu);
+ kvm_uninit_stage2_mmu(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,