diff options
author | Marc Zyngier <maz@kernel.org> | 2023-12-18 20:07:34 +0300 |
---|---|---|
committer | Marc Zyngier <maz@kernel.org> | 2023-12-18 20:07:34 +0300 |
commit | 189f2c8e0c420b194af0ee22b8f247948cb577be (patch) | |
tree | 4766a8eb3c66fbfdc5deb53ad153e03e6d4c71ca /arch | |
parent | 2cc14f52aeb78ce3f29677c2de1f06c0e91471ab (diff) | |
parent | 11e5ea5242e38d44fcede879473566bb6d68f954 (diff) | |
download | linux-189f2c8e0c420b194af0ee22b8f247948cb577be.tar.xz |
Merge branch kvm-arm64/lpa2 into kvmarm-master/next
* kvm-arm64/lpa2:
: .
: Support FEAT_LPA2 at EL2 S1 and S2, courtesy of Ryan Roberts
:
: From the cover letter:
:
: "This adds support for FEAT_LPA2 to KVM for both hypervisor stage 1 (for the
: nvhe/protected modes) and the vm stage 2 translation tables (for all modes).
: FEAT_LPA2 enables 52 bit PAs and VAs for 4KB and 16KB granules (note this is
: already supported for 64KB granules via the FEAT_LPA and FEAT_LVA extensions)."
: .
KVM: arm64: Use helpers to classify exception types reported via ESR
KVM: selftests: arm64: Support P52V48 4K and 16K guest_modes
KVM: selftests: arm64: Determine max ipa size per-page size
KVM: arm64: Allow guests with >48-bit IPA size on FEAT_LPA2 systems
KVM: arm64: Support up to 5 levels of translation in kvm_pgtable
KVM: arm64: Convert translation level parameter to s8
KVM: arm64: Use LPA2 page-tables for stage2 and hyp stage1
KVM: arm64: Add new (V)TCR_EL2 field definitions for FEAT_LPA2
arm64: Add ARM64_HAS_LPA2 CPU capability
arm64/mm: Add FEAT_LPA2 specific ID_AA64MMFR0.TGRAN[2]
arm64/mm: Update tlb invalidation routines for FEAT_LPA2
arm64/mm: Add lpa2_is_enabled() kvm_lpa2_is_enabled() stubs
arm64/mm: Modify range-based tlbi to decrement scale
Signed-off-by: Marc Zyngier <maz@kernel.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/arm64/include/asm/cpufeature.h | 5 | ||||
-rw-r--r-- | arch/arm64/include/asm/esr.h | 15 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_arm.h | 2 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_emulate.h | 26 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_pgtable.h | 80 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_pkvm.h | 5 | ||||
-rw-r--r-- | arch/arm64/include/asm/pgtable-prot.h | 2 | ||||
-rw-r--r-- | arch/arm64/include/asm/sysreg.h | 5 | ||||
-rw-r--r-- | arch/arm64/include/asm/tlb.h | 15 | ||||
-rw-r--r-- | arch/arm64/include/asm/tlbflush.h | 100 | ||||
-rw-r--r-- | arch/arm64/kernel/cpufeature.c | 39 | ||||
-rw-r--r-- | arch/arm64/kvm/arm.c | 5 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/include/hyp/fault.h | 2 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/include/hyp/switch.h | 2 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/hyp-init.S | 4 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/mem_protect.c | 6 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/mm.c | 4 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/setup.c | 2 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/pgtable.c | 90 | ||||
-rw-r--r-- | arch/arm64/kvm/mmu.c | 49 | ||||
-rw-r--r-- | arch/arm64/kvm/reset.c | 9 | ||||
-rw-r--r-- | arch/arm64/tools/cpucaps | 1 |
22 files changed, 314 insertions, 154 deletions
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index f6d416fe49b0..acf109581ac0 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -819,6 +819,11 @@ static inline bool system_supports_tlb_range(void) return alternative_has_cap_unlikely(ARM64_HAS_TLB_RANGE); } +static inline bool system_supports_lpa2(void) +{ + return cpus_have_final_cap(ARM64_HAS_LPA2); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index ae35939f395b..353fe08546cf 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -392,6 +392,21 @@ static inline bool esr_is_data_abort(unsigned long esr) return ec == ESR_ELx_EC_DABT_LOW || ec == ESR_ELx_EC_DABT_CUR; } +static inline bool esr_fsc_is_translation_fault(unsigned long esr) +{ + return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT; +} + +static inline bool esr_fsc_is_permission_fault(unsigned long esr) +{ + return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM; +} + +static inline bool esr_fsc_is_access_flag_fault(unsigned long esr) +{ + return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_ACCESS; +} + const char *esr_get_class_string(unsigned long esr); #endif /* __ASSEMBLY */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index b85f46a73e21..312cbc300831 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -108,6 +108,7 @@ #define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En) /* TCR_EL2 Registers bits */ +#define TCR_EL2_DS (1UL << 32) #define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS_SHIFT 16 @@ -122,6 +123,7 @@ TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) /* VTCR_EL2 Registers bits */ +#define VTCR_EL2_DS TCR_EL2_DS #define VTCR_EL2_RES1 (1U << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 78a550537b67..31f13e7d339b 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -404,14 +404,25 @@ static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu) return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC; } -static __always_inline u8 kvm_vcpu_trap_get_fault_type(const struct kvm_vcpu *vcpu) +static inline +bool kvm_vcpu_trap_is_permission_fault(const struct kvm_vcpu *vcpu) { - return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_TYPE; + return esr_fsc_is_permission_fault(kvm_vcpu_get_esr(vcpu)); } -static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *vcpu) +static inline +bool kvm_vcpu_trap_is_translation_fault(const struct kvm_vcpu *vcpu) { - return kvm_vcpu_get_esr(vcpu) & ESR_ELx_FSC_LEVEL; + return esr_fsc_is_translation_fault(kvm_vcpu_get_esr(vcpu)); +} + +static inline +u64 kvm_vcpu_trap_get_perm_fault_granule(const struct kvm_vcpu *vcpu) +{ + unsigned long esr = kvm_vcpu_get_esr(vcpu); + + BUG_ON(!esr_fsc_is_permission_fault(esr)); + return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(esr & ESR_ELx_FSC_LEVEL)); } static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) @@ -454,12 +465,7 @@ static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) * first), then a permission fault to allow the flags * to be set. */ - switch (kvm_vcpu_trap_get_fault_type(vcpu)) { - case ESR_ELx_FSC_PERM: - return true; - default: - return false; - } + return kvm_vcpu_trap_is_permission_fault(vcpu); } if (kvm_vcpu_trap_is_iabt(vcpu)) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index d3e354bb8351..cfdf40f734b1 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -11,7 +11,8 @@ #include <linux/kvm_host.h> #include <linux/types.h> -#define KVM_PGTABLE_MAX_LEVELS 4U +#define KVM_PGTABLE_FIRST_LEVEL -1 +#define KVM_PGTABLE_LAST_LEVEL 3 /* * The largest supported block sizes for KVM (no 52-bit PA support): @@ -20,17 +21,29 @@ * - 64K (level 2): 512MB */ #ifdef CONFIG_ARM64_4K_PAGES -#define KVM_PGTABLE_MIN_BLOCK_LEVEL 1U +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 1 #else -#define KVM_PGTABLE_MIN_BLOCK_LEVEL 2U +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 2 #endif +#define kvm_lpa2_is_enabled() system_supports_lpa2() + +static inline u64 kvm_get_parange_max(void) +{ + if (kvm_lpa2_is_enabled() || + (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && PAGE_SHIFT == 16)) + return ID_AA64MMFR0_EL1_PARANGE_52; + else + return ID_AA64MMFR0_EL1_PARANGE_48; +} + static inline u64 kvm_get_parange(u64 mmfr0) { + u64 parange_max = kvm_get_parange_max(); u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); - if (parange > ID_AA64MMFR0_EL1_PARANGE_MAX) - parange = ID_AA64MMFR0_EL1_PARANGE_MAX; + if (parange > parange_max) + parange = parange_max; return parange; } @@ -41,6 +54,8 @@ typedef u64 kvm_pte_t; #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) #define KVM_PTE_ADDR_51_48 GENMASK(15, 12) +#define KVM_PTE_ADDR_MASK_LPA2 GENMASK(49, PAGE_SHIFT) +#define KVM_PTE_ADDR_51_50_LPA2 GENMASK(9, 8) #define KVM_PHYS_INVALID (-1ULL) @@ -51,21 +66,34 @@ static inline bool kvm_pte_valid(kvm_pte_t pte) static inline u64 kvm_pte_to_phys(kvm_pte_t pte) { - u64 pa = pte & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + u64 pa; + + if (kvm_lpa2_is_enabled()) { + pa = pte & KVM_PTE_ADDR_MASK_LPA2; + pa |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, pte) << 50; + } else { + pa = pte & KVM_PTE_ADDR_MASK; + if (PAGE_SHIFT == 16) + pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + } return pa; } static inline kvm_pte_t kvm_phys_to_pte(u64 pa) { - kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) { - pa &= GENMASK(51, 48); - pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + kvm_pte_t pte; + + if (kvm_lpa2_is_enabled()) { + pte = pa & KVM_PTE_ADDR_MASK_LPA2; + pa &= GENMASK(51, 50); + pte |= FIELD_PREP(KVM_PTE_ADDR_51_50_LPA2, pa >> 50); + } else { + pte = pa & KVM_PTE_ADDR_MASK; + if (PAGE_SHIFT == 16) { + pa &= GENMASK(51, 48); + pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + } } return pte; @@ -76,28 +104,28 @@ static inline kvm_pfn_t kvm_pte_to_pfn(kvm_pte_t pte) return __phys_to_pfn(kvm_pte_to_phys(pte)); } -static inline u64 kvm_granule_shift(u32 level) +static inline u64 kvm_granule_shift(s8 level) { - /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ + /* Assumes KVM_PGTABLE_LAST_LEVEL is 3 */ return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); } -static inline u64 kvm_granule_size(u32 level) +static inline u64 kvm_granule_size(s8 level) { return BIT(kvm_granule_shift(level)); } -static inline bool kvm_level_supports_block_mapping(u32 level) +static inline bool kvm_level_supports_block_mapping(s8 level) { return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } static inline u32 kvm_supported_block_sizes(void) { - u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; + s8 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; u32 r = 0; - for (; level < KVM_PGTABLE_MAX_LEVELS; level++) + for (; level <= KVM_PGTABLE_LAST_LEVEL; level++) r |= BIT(kvm_granule_shift(level)); return r; @@ -142,7 +170,7 @@ struct kvm_pgtable_mm_ops { void* (*zalloc_page)(void *arg); void* (*zalloc_pages_exact)(size_t size); void (*free_pages_exact)(void *addr, size_t size); - void (*free_unlinked_table)(void *addr, u32 level); + void (*free_unlinked_table)(void *addr, s8 level); void (*get_page)(void *addr); void (*put_page)(void *addr); int (*page_count)(void *addr); @@ -238,7 +266,7 @@ struct kvm_pgtable_visit_ctx { u64 start; u64 addr; u64 end; - u32 level; + s8 level; enum kvm_pgtable_walk_flags flags; }; @@ -341,7 +369,7 @@ static inline bool kvm_pgtable_walk_lock_held(void) */ struct kvm_pgtable { u32 ia_bits; - u32 start_level; + s8 start_level; kvm_pteref_t pgd; struct kvm_pgtable_mm_ops *mm_ops; @@ -475,7 +503,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); * The page-table is assumed to be unreachable by any hardware walkers prior to * freeing and therefore no TLB invalidation is performed. */ -void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level); /** * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure. @@ -499,7 +527,7 @@ void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *p * an ERR_PTR(error) on failure. */ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, - u64 phys, u32 level, + u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte); @@ -725,7 +753,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, - kvm_pte_t *ptep, u32 *level); + kvm_pte_t *ptep, s8 *level); /** * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index e46250a02017..ad9cfb5c1ff4 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -56,10 +56,11 @@ static inline unsigned long hyp_vm_table_pages(void) static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) { - unsigned long total = 0, i; + unsigned long total = 0; + int i; /* Provision the worst case scenario */ - for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) { + for (i = KVM_PGTABLE_FIRST_LEVEL; i <= KVM_PGTABLE_LAST_LEVEL; i++) { nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); total += nr_pages; } diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index e9624f6326dd..483dbfa39c4c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -71,6 +71,8 @@ extern bool arm64_use_ng_mappings; #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) +#define lpa2_is_enabled() false + /* * If we have userspace only BTI we don't want to mark kernel pages * guarded even if the system does support BTI. diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 5e65f51c10d2..48181cf6cc40 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -871,10 +871,12 @@ /* id_aa64mmfr0 */ #define ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN 0x0 +#define ID_AA64MMFR0_EL1_TGRAN4_LPA2 ID_AA64MMFR0_EL1_TGRAN4_52_BIT #define ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX 0x7 #define ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN 0x0 #define ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX 0x7 #define ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN 0x1 +#define ID_AA64MMFR0_EL1_TGRAN16_LPA2 ID_AA64MMFR0_EL1_TGRAN16_52_BIT #define ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX 0xf #define ARM64_MIN_PARANGE_BITS 32 @@ -882,6 +884,7 @@ #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_DEFAULT 0x0 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_NONE 0x1 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MIN 0x2 +#define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2 0x3 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MAX 0x7 #ifdef CONFIG_ARM64_PA_BITS_52 @@ -892,11 +895,13 @@ #if defined(CONFIG_ARM64_4K_PAGES) #define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN4_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_LPA2 ID_AA64MMFR0_EL1_TGRAN4_52_BIT #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX #define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT #elif defined(CONFIG_ARM64_16K_PAGES) #define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN16_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_LPA2 ID_AA64MMFR0_EL1_TGRAN16_52_BIT #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX #define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 846c563689a8..0150deb332af 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -22,15 +22,15 @@ static void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> /* - * get the tlbi levels in arm64. Default value is 0 if more than one - * of cleared_* is set or neither is set. - * Arm64 doesn't support p4ds now. + * get the tlbi levels in arm64. Default value is TLBI_TTL_UNKNOWN if more than + * one of cleared_* is set or neither is set - this elides the level hinting to + * the hardware. */ static inline int tlb_get_level(struct mmu_gather *tlb) { /* The TTL field is only valid for the leaf entry. */ if (tlb->freed_tables) - return 0; + return TLBI_TTL_UNKNOWN; if (tlb->cleared_ptes && !(tlb->cleared_pmds || tlb->cleared_puds || @@ -47,7 +47,12 @@ static inline int tlb_get_level(struct mmu_gather *tlb) tlb->cleared_p4ds)) return 1; - return 0; + if (tlb->cleared_p4ds && !(tlb->cleared_ptes || + tlb->cleared_pmds || + tlb->cleared_puds)) + return 0; + + return TLBI_TTL_UNKNOWN; } static inline void tlb_flush(struct mmu_gather *tlb) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index bb2c2833a987..1deb5d789c2e 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -94,19 +94,22 @@ static inline unsigned long get_trans_granule(void) * When ARMv8.4-TTL exists, TLBI operations take an additional hint for * the level at which the invalidation must take place. If the level is * wrong, no invalidation may take place. In the case where the level - * cannot be easily determined, a 0 value for the level parameter will - * perform a non-hinted invalidation. + * cannot be easily determined, the value TLBI_TTL_UNKNOWN will perform + * a non-hinted invalidation. Any provided level outside the hint range + * will also cause fall-back to non-hinted invalidation. * * For Stage-2 invalidation, use the level values provided to that effect * in asm/stage2_pgtable.h. */ #define TLBI_TTL_MASK GENMASK_ULL(47, 44) +#define TLBI_TTL_UNKNOWN INT_MAX + #define __tlbi_level(op, addr, level) do { \ u64 arg = addr; \ \ if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && \ - level) { \ + level >= 0 && level <= 3) { \ u64 ttl = level & 3; \ ttl |= get_trans_granule() << 2; \ arg &= ~TLBI_TTL_MASK; \ @@ -122,28 +125,34 @@ static inline unsigned long get_trans_granule(void) } while (0) /* - * This macro creates a properly formatted VA operand for the TLB RANGE. - * The value bit assignments are: + * This macro creates a properly formatted VA operand for the TLB RANGE. The + * value bit assignments are: * * +----------+------+-------+-------+-------+----------------------+ * | ASID | TG | SCALE | NUM | TTL | BADDR | * +-----------------+-------+-------+-------+----------------------+ * |63 48|47 46|45 44|43 39|38 37|36 0| * - * The address range is determined by below formula: - * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE) + * The address range is determined by below formula: [BADDR, BADDR + (NUM + 1) * + * 2^(5*SCALE + 1) * PAGESIZE) + * + * Note that the first argument, baddr, is pre-shifted; If LPA2 is in use, BADDR + * holds addr[52:16]. Else BADDR holds page number. See for example ARM DDI + * 0487J.a section C5.5.60 "TLBI VAE1IS, TLBI VAE1ISNXS, TLB Invalidate by VA, + * EL1, Inner Shareable". * */ -#define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl) \ - ({ \ - unsigned long __ta = (addr) >> PAGE_SHIFT; \ - __ta &= GENMASK_ULL(36, 0); \ - __ta |= (unsigned long)(ttl) << 37; \ - __ta |= (unsigned long)(num) << 39; \ - __ta |= (unsigned long)(scale) << 44; \ - __ta |= get_trans_granule() << 46; \ - __ta |= (unsigned long)(asid) << 48; \ - __ta; \ +#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \ + ({ \ + unsigned long __ta = (baddr); \ + unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0; \ + __ta &= GENMASK_ULL(36, 0); \ + __ta |= __ttl << 37; \ + __ta |= (unsigned long)(num) << 39; \ + __ta |= (unsigned long)(scale) << 44; \ + __ta |= get_trans_granule() << 46; \ + __ta |= (unsigned long)(asid) << 48; \ + __ta; \ }) /* These macros are used by the TLBI RANGE feature. */ @@ -216,12 +225,16 @@ static inline unsigned long get_trans_granule(void) * CPUs, ensuring that any walk-cache entries associated with the * translation are also invalidated. * - * __flush_tlb_range(vma, start, end, stride, last_level) + * __flush_tlb_range(vma, start, end, stride, last_level, tlb_level) * Invalidate the virtual-address range '[start, end)' on all * CPUs for the user address space corresponding to 'vma->mm'. * The invalidation operations are issued at a granularity * determined by 'stride' and only affect any walk-cache entries - * if 'last_level' is equal to false. + * if 'last_level' is equal to false. tlb_level is the level at + * which the invalidation must take place. If the level is wrong, + * no invalidation may take place. In the case where the level + * cannot be easily determined, the value TLBI_TTL_UNKNOWN will + * perform a non-hinted invalidation. * * * Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented @@ -345,34 +358,44 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * @tlb_level: Translation Table level hint, if known * @tlbi_user: If 'true', call an additional __tlbi_user() * (typically for user ASIDs). 'flase' for IPA instructions + * @lpa2: If 'true', the lpa2 scheme is used as set out below * * When the CPU does not support TLB range operations, flush the TLB * entries one by one at the granularity of 'stride'. If the TLB * range ops are supported, then: * - * 1. If 'pages' is odd, flush the first page through non-range - * operations; + * 1. If FEAT_LPA2 is in use, the start address of a range operation must be + * 64KB aligned, so flush pages one by one until the alignment is reached + * using the non-range operations. This step is skipped if LPA2 is not in + * use. + * + * 2. The minimum range granularity is decided by 'scale', so multiple range + * TLBI operations may be required. Start from scale = 3, flush the largest + * possible number of pages ((num+1)*2^(5*scale+1)) that fit into the + * requested range, then decrement scale and continue until one or zero pages + * are left. We must start from highest scale to ensure 64KB start alignment + * is maintained in the LPA2 case. * - * 2. For remaining pages: the minimum range granularity is decided - * by 'scale', so multiple range TLBI operations may be required. - * Start from scale = 0, flush the corresponding number of pages - * ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it - * until no pages left. + * 3. If there is 1 page remaining, flush it through non-range operations. Range + * operations can only span an even number of pages. We save this for last to + * ensure 64KB start alignment is maintained for the LPA2 case. * * Note that certain ranges can be represented by either num = 31 and * scale or num = 0 and scale + 1. The loop below favours the latter * since num is limited to 30 by the __TLBI_RANGE_NUM() macro. */ #define __flush_tlb_range_op(op, start, pages, stride, \ - asid, tlb_level, tlbi_user) \ + asid, tlb_level, tlbi_user, lpa2) \ do { \ int num = 0; \ - int scale = 0; \ + int scale = 3; \ + int shift = lpa2 ? 16 : PAGE_SHIFT; \ unsigned long addr; \ \ while (pages > 0) { \ if (!system_supports_tlb_range() || \ - pages % 2 == 1) { \ + pages == 1 || \ + (lpa2 && start != ALIGN(start, SZ_64K))) { \ addr = __TLBI_VADDR(start, asid); \ __tlbi_level(op, addr, tlb_level); \ if (tlbi_user) \ @@ -384,20 +407,20 @@ do { \ \ num = __TLBI_RANGE_NUM(pages, scale); \ if (num >= 0) { \ - addr = __TLBI_VADDR_RANGE(start, asid, scale, \ - num, tlb_level); \ + addr = __TLBI_VADDR_RANGE(start >> shift, asid, \ + scale, num, tlb_level); \ __tlbi(r##op, addr); \ if (tlbi_user) \ __tlbi_user(r##op, addr); \ start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ pages -= __TLBI_RANGE_PAGES(num, scale); \ } \ - scale++; \ + scale--; \ } \ } while (0) #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ - __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false) + __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -427,9 +450,11 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, asid = ASID(vma->vm_mm); if (last_level) - __flush_tlb_range_op(vale1is, start, pages, stride, asid, tlb_level, true); + __flush_tlb_range_op(vale1is, start, pages, stride, asid, + tlb_level, true, lpa2_is_enabled()); else - __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true); + __flush_tlb_range_op(vae1is, start, pages, stride, asid, + tlb_level, true, lpa2_is_enabled()); dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); @@ -441,9 +466,10 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, /* * We cannot use leaf-only invalidation here, since we may be invalidating * table entries as part of collapsing hugepages or moving page tables. - * Set the tlb_level to 0 because we can not get enough information here. + * Set the tlb_level to TLBI_TTL_UNKNOWN because we can not get enough + * information here. */ - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 646591c67e7a..7900ba7e157e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1768,6 +1768,39 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, return !meltdown_safe; } +#if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) +static bool has_lpa2_at_stage1(u64 mmfr0) +{ + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_LPA2; +} + +static bool has_lpa2_at_stage2(u64 mmfr0) +{ + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_2_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2; +} + +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 mmfr0; + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + return has_lpa2_at_stage1(mmfr0) && has_lpa2_at_stage2(mmfr0); +} +#else +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + return false; +} +#endif + #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 #define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) @@ -2731,6 +2764,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP) }, + { + .desc = "52-bit Virtual Addressing for KVM (LPA2)", + .capability = ARM64_HAS_LPA2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_lpa2, + }, {}, }; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e5f75f1f1085..c4bbc224549b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1837,6 +1837,7 @@ static int kvm_init_vector_slots(void) static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); + u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); unsigned long tcr; /* @@ -1859,6 +1860,10 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) } tcr &= ~TCR_T0SZ_MASK; tcr |= TCR_T0SZ(hyp_va_bits); + tcr &= ~TCR_EL2_PS_MASK; + tcr |= FIELD_PREP(TCR_EL2_PS_MASK, kvm_get_parange(mmfr0)); + if (kvm_lpa2_is_enabled()) + tcr |= TCR_EL2_DS; params->tcr_el2 = tcr; params->pgd_pa = kvm_mmu_get_httbr(); diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 9ddcfe2c3e57..9e13c1bc2ad5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -60,7 +60,7 @@ static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) */ if (!(esr & ESR_ELx_S1PTW) && (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM)) { + esr_fsc_is_permission_fault(esr))) { if (!__translate_far_to_hpfar(far, &hpfar)) return false; } else { diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index f99d8af0b9af..f44fb11307fb 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -591,7 +591,7 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { bool valid; - valid = kvm_vcpu_trap_get_fault_type(vcpu) == ESR_ELx_FSC_FAULT && + valid = kvm_vcpu_trap_is_translation_fault(vcpu) && kvm_vcpu_dabt_isvalid(vcpu) && !kvm_vcpu_abt_issea(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 1cc06e6797bd..f62a7d360285 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -122,11 +122,7 @@ alternative_if ARM64_HAS_CNP alternative_else_nop_endif msr ttbr0_el2, x2 - /* - * Set the PS bits in TCR_EL2. - */ ldr x0, [x0, #NVHE_INIT_TCR_EL2] - tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2 msr tcr_el2, x0 isb diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 8d0a5834e883..861c76021a25 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -91,7 +91,7 @@ static void host_s2_put_page(void *addr) hyp_put_page(&host_s2_pool, addr); } -static void host_s2_free_unlinked_table(void *addr, u32 level) +static void host_s2_free_unlinked_table(void *addr, s8 level) { kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level); } @@ -443,7 +443,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) { struct kvm_mem_range cur; kvm_pte_t pte; - u32 level; + s8 level; int ret; hyp_assert_lock_held(&host_mmu.lock); @@ -462,7 +462,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) cur.start = ALIGN_DOWN(addr, granule); cur.end = cur.start + granule; level++; - } while ((level < KVM_PGTABLE_MAX_LEVELS) && + } while ((level <= KVM_PGTABLE_LAST_LEVEL) && !(kvm_level_supports_block_mapping(level) && range_included(&cur, range))); diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 65a7a186d7b2..b01a3d1078a8 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -260,7 +260,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 */ dsb(ishst); - __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1)); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), KVM_PGTABLE_LAST_LEVEL); dsb(ish); isb(); } @@ -275,7 +275,7 @@ static int __create_fixmap_slot_cb(const struct kvm_pgtable_visit_ctx *ctx, { struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)ctx->arg); - if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 1) + if (!kvm_pte_valid(ctx->old) || ctx->level != KVM_PGTABLE_LAST_LEVEL) return -EINVAL; slot->addr = ctx->addr; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 0d5e0a89ddce..bc58d1b515af 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -181,7 +181,7 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, if (!kvm_pte_valid(ctx->old)) return 0; - if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1)) + if (ctx->level != KVM_PGTABLE_LAST_LEVEL) return -EINVAL; phys = kvm_pte_to_phys(ctx->old); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 1966fdee740e..c651df904fe3 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -79,7 +79,10 @@ static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) static bool kvm_phys_is_valid(u64 phys) { - return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); + u64 parange_max = kvm_get_parange_max(); + u8 shift = id_aa64mmfr0_parange_to_phys_shift(parange_max); + + return phys < BIT(shift); } static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) @@ -98,7 +101,7 @@ static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, return IS_ALIGNED(ctx->addr, granule); } -static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) +static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level) { u64 shift = kvm_granule_shift(level); u64 mask = BIT(PAGE_SHIFT - 3) - 1; @@ -114,7 +117,7 @@ static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) return (addr & mask) >> shift; } -static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) +static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level) { struct kvm_pgtable pgt = { .ia_bits = ia_bits, @@ -124,9 +127,9 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) return kvm_pgd_page_idx(&pgt, -1ULL) + 1; } -static bool kvm_pte_table(kvm_pte_t pte, u32 level) +static bool kvm_pte_table(kvm_pte_t pte, s8 level) { - if (level == KVM_PGTABLE_MAX_LEVELS - 1) + if (level == KVM_PGTABLE_LAST_LEVEL) return false; if (!kvm_pte_valid(pte)) @@ -154,11 +157,11 @@ static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops return pte; } -static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) +static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level) { kvm_pte_t pte = kvm_phys_to_pte(pa); - u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE : - KVM_PTE_TYPE_BLOCK; + u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE : + KVM_PTE_TYPE_BLOCK; pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); pte |= FIELD_PREP(KVM_PTE_TYPE, type); @@ -203,11 +206,11 @@ static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level); + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level); static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, - kvm_pteref_t pteref, u32 level) + kvm_pteref_t pteref, s8 level) { enum kvm_pgtable_walk_flags flags = data->walker->flags; kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref); @@ -272,12 +275,13 @@ out: } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, - struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level) + struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level) { u32 idx; int ret = 0; - if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS)) + if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL || + level > KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { @@ -340,7 +344,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct leaf_walk_data { kvm_pte_t pte; - u32 level; + s8 level; }; static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -355,7 +359,7 @@ static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, } int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, - kvm_pte_t *ptep, u32 *level) + kvm_pte_t *ptep, s8 *level) { struct leaf_walk_data data; struct kvm_pgtable_walker walker = { @@ -408,7 +412,8 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) } attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); - attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); + if (!kvm_lpa2_is_enabled()) + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; @@ -467,7 +472,7 @@ static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, if (hyp_map_walker_try_leaf(ctx, data)) return 0; - if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); @@ -563,14 +568,19 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, struct kvm_pgtable_mm_ops *mm_ops) { - u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); + s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 - + ARM64_HW_PGTABLE_LEVELS(va_bits); + + if (start_level < KVM_PGTABLE_FIRST_LEVEL || + start_level > KVM_PGTABLE_LAST_LEVEL) + return -EINVAL; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL); if (!pgt->pgd) return -ENOMEM; pgt->ia_bits = va_bits; - pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; + pgt->start_level = start_level; pgt->mm_ops = mm_ops; pgt->mmu = NULL; pgt->force_pte_cb = NULL; @@ -624,7 +634,7 @@ struct stage2_map_data { u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) { u64 vtcr = VTCR_EL2_FLAGS; - u8 lvls; + s8 lvls; vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; vtcr |= VTCR_EL2_T0SZ(phys_shift); @@ -635,6 +645,15 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) lvls = stage2_pgtable_levels(phys_shift); if (lvls < 2) lvls = 2; + + /* + * When LPA2 is enabled, the HW supports an extra level of translation + * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2 + * to as an addition to SL0 to enable encoding this extra start level. + * However, since we always use concatenated pages for the first level + * lookup, we will never need this extra level and therefore do not need + * to touch SL2. + */ vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); #ifdef CONFIG_ARM64_HW_AFDBM @@ -654,6 +673,9 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) vtcr |= VTCR_EL2_HA; #endif /* CONFIG_ARM64_HW_AFDBM */ + if (kvm_lpa2_is_enabled()) + vtcr |= VTCR_EL2_DS; + /* Set the vmid bits */ vtcr |= (get_vmid_bits(mmfr1) == 16) ? VTCR_EL2_VS_16BIT : @@ -711,7 +733,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p if (prot & KVM_PGTABLE_PROT_W) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; - attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + if (!kvm_lpa2_is_enabled()) + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; @@ -902,7 +926,7 @@ static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, { u64 phys = stage2_map_walker_phys_addr(ctx, data); - if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1))) + if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) return false; return kvm_block_mapping_supported(ctx, phys); @@ -981,7 +1005,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, if (ret != -E2BIG) return ret; - if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)) + if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; if (!data->memcache) @@ -1151,7 +1175,7 @@ struct stage2_attr_data { kvm_pte_t attr_set; kvm_pte_t attr_clr; kvm_pte_t pte; - u32 level; + s8 level; }; static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, @@ -1194,7 +1218,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, u64 size, kvm_pte_t attr_set, kvm_pte_t attr_clr, kvm_pte_t *orig_pte, - u32 *level, enum kvm_pgtable_walk_flags flags) + s8 *level, enum kvm_pgtable_walk_flags flags) { int ret; kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; @@ -1296,7 +1320,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot) { int ret; - u32 level; + s8 level; kvm_pte_t set = 0, clr = 0; if (prot & KVM_PTE_LEAF_ATTR_HI_SW) @@ -1349,7 +1373,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) } kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, - u64 phys, u32 level, + u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte) { @@ -1407,7 +1431,7 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, * fully populated tree up to the PTE entries. Note that @level is * interpreted as in "level @level entry". */ -static int stage2_block_get_nr_page_tables(u32 level) +static int stage2_block_get_nr_page_tables(s8 level) { switch (level) { case 1: @@ -1418,7 +1442,7 @@ static int stage2_block_get_nr_page_tables(u32 level) return 0; default: WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || - level >= KVM_PGTABLE_MAX_LEVELS); + level > KVM_PGTABLE_LAST_LEVEL); return -EINVAL; }; } @@ -1431,13 +1455,13 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu; kvm_pte_t pte = ctx->old, new, *childp; enum kvm_pgtable_prot prot; - u32 level = ctx->level; + s8 level = ctx->level; bool force_pte; int nr_pages; u64 phys; /* No huge-pages exist at the last level */ - if (level == KVM_PGTABLE_MAX_LEVELS - 1) + if (level == KVM_PGTABLE_LAST_LEVEL) return 0; /* We only split valid block mappings */ @@ -1514,7 +1538,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, u64 vtcr = mmu->vtcr; u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); - u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); @@ -1537,7 +1561,7 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) { u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); - u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; } @@ -1573,7 +1597,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->pgd = NULL; } -void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; struct kvm_pgtable_walker walker = { diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d87c8fcc4c24..d14504821b79 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -223,12 +223,12 @@ static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) { struct page *page = container_of(head, struct page, rcu_head); void *pgtable = page_to_virt(page); - u32 level = page_private(page); + s8 level = page_private(page); kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level); } -static void stage2_free_unlinked_table(void *addr, u32 level) +static void stage2_free_unlinked_table(void *addr, s8 level) { struct page *page = virt_to_page(addr); @@ -804,13 +804,13 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) struct kvm_pgtable pgt = { .pgd = (kvm_pteref_t)kvm->mm->pgd, .ia_bits = vabits_actual, - .start_level = (KVM_PGTABLE_MAX_LEVELS - - CONFIG_PGTABLE_LEVELS), + .start_level = (KVM_PGTABLE_LAST_LEVEL - + CONFIG_PGTABLE_LEVELS + 1), .mm_ops = &kvm_user_mm_ops, }; unsigned long flags; kvm_pte_t pte = 0; /* Keep GCC quiet... */ - u32 level = ~0; + s8 level = S8_MAX; int ret; /* @@ -829,7 +829,9 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) * Not seeing an error, but not updating level? Something went * deeply wrong... */ - if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS)) + if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL)) + return -EFAULT; + if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL)) return -EFAULT; /* Oops, the userspace PTs are gone... Replay the fault */ @@ -1374,7 +1376,7 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, - unsigned long fault_status) + bool fault_is_perm) { int ret = 0; bool write_fault, writable, force_pte = false; @@ -1388,17 +1390,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); - unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; - fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level); + if (fault_is_perm) + fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); VM_BUG_ON(write_fault && exec_fault); - if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) { + if (fault_is_perm && !write_fault && !exec_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; } @@ -1409,8 +1411,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * only exception to this is when dirty logging is enabled at runtime * and a write fault needs to collapse a block entry into a table. */ - if (fault_status != ESR_ELx_FSC_PERM || - (logging_active && write_fault)) { + if (!fault_is_perm || (logging_active && write_fault)) { ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu)); if (ret) @@ -1527,8 +1528,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * backed by a THP and thus use block mapping if possible. */ if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { - if (fault_status == ESR_ELx_FSC_PERM && - fault_granule > PAGE_SIZE) + if (fault_is_perm && fault_granule > PAGE_SIZE) vma_pagesize = fault_granule; else vma_pagesize = transparent_hugepage_adjust(kvm, memslot, @@ -1541,7 +1541,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } } - if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { + if (!fault_is_perm && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ if (mte_allowed) { sanitise_mte_tags(kvm, pfn, vma_pagesize); @@ -1567,7 +1567,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) + if (fault_is_perm && vma_pagesize == fault_granule) ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); else ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, @@ -1618,7 +1618,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) */ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) { - unsigned long fault_status; + unsigned long esr; phys_addr_t fault_ipa; struct kvm_memory_slot *memslot; unsigned long hva; @@ -1626,12 +1626,12 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) gfn_t gfn; int ret, idx; - fault_status = kvm_vcpu_trap_get_fault_type(vcpu); + esr = kvm_vcpu_get_esr(vcpu); fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - if (fault_status == ESR_ELx_FSC_FAULT) { + if (esr_fsc_is_permission_fault(esr)) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { kvm_inject_size_fault(vcpu); @@ -1666,9 +1666,9 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) kvm_vcpu_get_hfar(vcpu), fault_ipa); /* Check the stage-2 fault is trans. fault or write fault */ - if (fault_status != ESR_ELx_FSC_FAULT && - fault_status != ESR_ELx_FSC_PERM && - fault_status != ESR_ELx_FSC_ACCESS) { + if (!esr_fsc_is_translation_fault(esr) && + !esr_fsc_is_permission_fault(esr) && + !esr_fsc_is_access_flag_fault(esr)) { kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), @@ -1730,13 +1730,14 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) /* Userspace should not be able to register out-of-bounds IPAs */ VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu)); - if (fault_status == ESR_ELx_FSC_ACCESS) { + if (esr_fsc_is_access_flag_fault(esr)) { handle_access_fault(vcpu, fault_ipa); ret = 1; goto out_unlock; } - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, + esr_fsc_is_permission_fault(esr)); if (ret == 0) ret = 1; out: diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5bb4de162cab..68d1d05672bd 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -280,12 +280,11 @@ int __init kvm_set_ipa_limit(void) parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); /* - * IPA size beyond 48 bits could not be supported - * on either 4K or 16K page size. Hence let's cap - * it to 48 bits, in case it's reported as larger - * on the system. + * IPA size beyond 48 bits for 4K and 16K page size is only supported + * when LPA2 is available. So if we have LPA2, enable it, else cap to 48 + * bits, in case it's reported as larger on the system. */ - if (PAGE_SIZE != SZ_64K) + if (!kvm_lpa2_is_enabled() && PAGE_SIZE != SZ_64K) parange = min(parange, (unsigned int)ID_AA64MMFR0_EL1_PARANGE_48); /* diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index b98c38288a9d..919eceb0b3da 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -37,6 +37,7 @@ HAS_GIC_PRIO_MASKING HAS_GIC_PRIO_RELAXED_SYNC HAS_HCX HAS_LDAPR +HAS_LPA2 HAS_LSE_ATOMICS HAS_MOPS HAS_NESTED_VIRT |