diff options
Diffstat (limited to 'arch/arm64')
24 files changed, 882 insertions, 132 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6e1a2edb2be0..e170c7eb20cc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1519,7 +1519,7 @@ config ARCH_SUPPORTS_CRASH_DUMP def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config TRANS_TABLE def_bool y @@ -2229,6 +2229,15 @@ config UNWIND_PATCH_PAC_INTO_SCS select UNWIND_TABLES select DYNAMIC_SCS +config ARM64_CONTPTE + bool "Contiguous PTE mappings for user memory" if EXPERT + depends on TRANSPARENT_HUGEPAGE + default y + help + When enabled, user mappings are configured using the PTE contiguous + bit, for any mappings that meet the size and alignment requirements. + This reduces TLB pressure and improves performance. + endmenu # "Kernel Features" menu "Boot options" diff --git a/arch/arm64/include/asm/crash_core.h b/arch/arm64/include/asm/crash_reserve.h index 9f5c8d339f44..4afe027a4e7b 100644 --- a/arch/arm64/include/asm/crash_core.h +++ b/arch/arm64/include/asm/crash_reserve.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ARM64_CRASH_CORE_H -#define _ARM64_CRASH_CORE_H +#ifndef _ARM64_CRASH_RESERVE_H +#define _ARM64_CRASH_RESERVE_H /* Current arm64 boot protocol requires 2MB alignment */ #define CRASH_ALIGN SZ_2M diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 9ac9572a3bbe..4d9cc7a76d9c 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -80,7 +80,7 @@ static inline void crash_setup_regs(struct pt_regs *newregs, } } -#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_HIBERNATION) +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) extern bool crash_is_nosave(unsigned long pfn); extern void crash_prepare_suspend(void); extern void crash_post_resume(void); diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8bec85350865..afdd56d26ad7 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -98,7 +98,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pte_none(pte) (!pte_val(pte)) -#define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) +#define __pte_clear(mm, addr, ptep) \ + __set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) /* @@ -138,11 +139,15 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN)) /* + * Returns true if the pte is valid and has the contiguous bit set. + */ +#define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte)) +/* * Could the pte be present in the TLB? We must check mm_tlb_flush_pending * so that we don't erroneously return false for pages that have been * remapped as PROT_NONE but are yet to be flushed from the TLB. * Note that we can't make any assumptions based on the state of the access - * flag, since ptep_clear_flush_young() elides a DSB when invalidating the + * flag, since __ptep_clear_flush_young() elides a DSB when invalidating the * TLB. */ #define pte_accessible(mm, pte) \ @@ -266,7 +271,7 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } -static inline void set_pte(pte_t *ptep, pte_t pte) +static inline void __set_pte(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); @@ -280,6 +285,11 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } +static inline pte_t __ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} + extern void __sync_icache_dcache(pte_t pteval); bool pgattr_change_is_safe(u64 old, u64 new); @@ -307,7 +317,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = READ_ONCE(*ptep); + old_pte = __ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -316,7 +326,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, /* * Check for potential race with hardware updates of the pte - * (ptep_set_access_flags safely changes valid ptes without going + * (__ptep_set_access_flags safely changes valid ptes without going * through an invalid entry). */ VM_WARN_ONCE(!pte_young(pte), @@ -346,23 +356,38 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) mte_sync_tags(pte, nr_pages); } -static inline void set_ptes(struct mm_struct *mm, - unsigned long __always_unused addr, - pte_t *ptep, pte_t pte, unsigned int nr) +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + +#define pte_advance_pfn pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) +{ + return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); +} + +static inline void __set_ptes(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); __sync_cache_and_tags(pte, nr); for (;;) { __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); if (--nr == 0) break; ptep++; - pte_val(pte) += PAGE_SIZE; + pte = pte_advance_pfn(pte, 1); } } -#define set_ptes set_ptes /* * Huge pte definitions. @@ -438,16 +463,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); } -/* - * Select all bits except the pfn - */ -static inline pgprot_t pte_pgprot(pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - - return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); -} - #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/linux/pgtable.h @@ -539,7 +554,7 @@ static inline void __set_pte_at(struct mm_struct *mm, { __sync_cache_and_tags(pte, nr); __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, @@ -1033,8 +1048,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -extern int ptep_set_access_flags(struct vm_area_struct *vma, +extern int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); @@ -1044,7 +1058,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { - return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); + return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, + pmd_pte(entry), dirty); } static inline int pud_devmap(pud_t pud) @@ -1078,12 +1093,13 @@ static inline bool pud_user_accessible_page(pud_t pud) /* * Atomic pte/pmd modifications. */ -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int __ptep_test_and_clear_young(pte_t *ptep) +static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) { pte_t old_pte, pte; - pte = READ_ONCE(*ptep); + pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -1094,18 +1110,10 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep) return pte_young(pte); } -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) -{ - return __ptep_test_and_clear_young(ptep); -} - -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, +static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young = ptep_test_and_clear_young(vma, address, ptep); + int young = __ptep_test_and_clear_young(vma, address, ptep); if (young) { /* @@ -1128,12 +1136,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); + return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, +static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); @@ -1143,6 +1150,37 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, return pte; } +static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + __ptep_get_and_clear(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} + +static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = __ptep_get_and_clear(mm, addr, ptep); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, @@ -1156,16 +1194,12 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* - * ptep_set_wrprotect - mark read-only while trasferring potential hardware - * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. - */ -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) +static inline void ___ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep, + pte_t pte) { - pte_t old_pte, pte; + pte_t old_pte; - pte = READ_ONCE(*ptep); do { old_pte = pte; pte = pte_wrprotect(pte); @@ -1174,12 +1208,31 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } while (pte_val(pte) != pte_val(old_pte)); } +/* + * __ptep_set_wrprotect - mark read-only while trasferring potential hardware + * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. + */ +static inline void __ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep) +{ + ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep)); +} + +static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address, + pte_t *ptep, unsigned int nr) +{ + unsigned int i; + + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++) + __ptep_set_wrprotect(mm, address, ptep); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - ptep_set_wrprotect(mm, address, (pte_t *)pmdp); + __ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #define pmdp_establish pmdp_establish @@ -1257,7 +1310,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif /* CONFIG_ARM64_MTE */ /* - * On AArch64, the cache coherency is handled via the set_pte_at() function. + * On AArch64, the cache coherency is handled via the __set_ptes() function. */ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -1309,6 +1362,282 @@ extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma, extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); + +#ifdef CONFIG_ARM64_CONTPTE + +/* + * The contpte APIs are used to transparently manage the contiguous bit in ptes + * where it is possible and makes sense to do so. The PTE_CONT bit is considered + * a private implementation detail of the public ptep API (see below). + */ +extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); +extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); +extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); +extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full); +extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full); +extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr); +extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty); + +static __always_inline void contpte_try_fold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +{ + /* + * Only bother trying if both the virtual and physical addresses are + * aligned and correspond to the last entry in a contig range. The core + * code mostly modifies ranges from low to high, so this is the likely + * the last modification in the contig range, so a good time to fold. + * We can't fold special mappings, because there is no associated folio. + */ + + const unsigned long contmask = CONT_PTES - 1; + bool valign = ((addr >> PAGE_SHIFT) & contmask) == contmask; + + if (unlikely(valign)) { + bool palign = (pte_pfn(pte) & contmask) == contmask; + + if (unlikely(palign && + pte_valid(pte) && !pte_cont(pte) && !pte_special(pte))) + __contpte_try_fold(mm, addr, ptep, pte); + } +} + +static __always_inline void contpte_try_unfold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +{ + if (unlikely(pte_valid_cont(pte))) + __contpte_try_unfold(mm, addr, ptep, pte); +} + +#define pte_batch_hint pte_batch_hint +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + if (!pte_valid_cont(pte)) + return 1; + + return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1)); +} + +/* + * The below functions constitute the public API that arm64 presents to the + * core-mm to manipulate PTE entries within their page tables (or at least this + * is the subset of the API that arm64 needs to implement). These public + * versions will automatically and transparently apply the contiguous bit where + * it makes sense to do so. Therefore any users that are contig-aware (e.g. + * hugetlb, kernel mapper) should NOT use these APIs, but instead use the + * private versions, which are prefixed with double underscore. All of these + * APIs except for ptep_get_lockless() are expected to be called with the PTL + * held. Although the contiguous bit is considered private to the + * implementation, it is deliberately allowed to leak through the getters (e.g. + * ptep_get()), back to core code. This is required so that pte_leaf_size() can + * provide an accurate size for perf_get_pgtable_size(). But this leakage means + * its possible a pte will be passed to a setter with the contiguous bit set, so + * we explicitly clear the contiguous bit in those cases to prevent accidentally + * setting it in the pgtable. + */ + +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get(ptep, pte); +} + +#define ptep_get_lockless ptep_get_lockless +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get_lockless(ptep); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + /* + * We don't have the mm or vaddr so cannot unfold contig entries (since + * it requires tlb maintenance). set_pte() is not used in core code, so + * this should never even be called. Regardless do our best to service + * any call and emit a warning if there is any attempt to set a pte on + * top of an existing contig range. + */ + pte_t orig_pte = __ptep_get(ptep); + + WARN_ON_ONCE(pte_valid_cont(orig_pte)); + __set_pte(ptep, pte_mknoncont(pte)); +} + +#define set_ptes set_ptes +static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + pte = pte_mknoncont(pte); + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __set_ptes(mm, addr, ptep, pte, 1); + contpte_try_fold(mm, addr, ptep, pte); + } else { + contpte_set_ptes(mm, addr, ptep, pte, nr); + } +} + +static inline void pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __pte_clear(mm, addr, ptep); +} + +#define clear_full_ptes clear_full_ptes +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __clear_full_ptes(mm, addr, ptep, nr, full); + } else { + contpte_clear_full_ptes(mm, addr, ptep, nr, full); + } +} + +#define get_and_clear_full_ptes get_and_clear_full_ptes +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte; + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } else { + pte = contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } + + return pte; +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + return __ptep_get_and_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_test_and_clear_young(vma, addr, ptep); + + return contpte_ptep_test_and_clear_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +static inline int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_clear_flush_young(vma, addr, ptep); + + return contpte_ptep_clear_flush_young(vma, addr, ptep); +} + +#define wrprotect_ptes wrprotect_ptes +static __always_inline void wrprotect_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + if (likely(nr == 1)) { + /* + * Optimization: wrprotect_ptes() can only be called for present + * ptes so we only need to check contig bit as condition for + * unfold, and we can remove the contig bit from the pte we read + * to avoid re-reading. This speeds up fork() which is sensitive + * for order-0 folios. Equivalent to contpte_try_unfold(). + */ + pte_t orig_pte = __ptep_get(ptep); + + if (unlikely(pte_cont(orig_pte))) { + __contpte_try_unfold(mm, addr, ptep, orig_pte); + orig_pte = pte_mknoncont(orig_pte); + } + ___ptep_set_wrprotect(mm, addr, ptep, orig_pte); + } else { + contpte_wrprotect_ptes(mm, addr, ptep, nr); + } +} + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + wrprotect_ptes(mm, addr, ptep, 1); +} + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +static inline int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + pte_t orig_pte = __ptep_get(ptep); + + entry = pte_mknoncont(entry); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + + return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty); +} + +#else /* CONFIG_ARM64_CONTPTE */ + +#define ptep_get __ptep_get +#define set_pte __set_pte +#define set_ptes __set_ptes +#define pte_clear __pte_clear +#define clear_full_ptes __clear_full_ptes +#define get_and_clear_full_ptes __get_and_clear_full_ptes +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define ptep_get_and_clear __ptep_get_and_clear +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young __ptep_test_and_clear_young +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define ptep_clear_flush_young __ptep_clear_flush_young +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +#define ptep_set_wrprotect __ptep_set_wrprotect +#define wrprotect_ptes __wrprotect_ptes +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags __ptep_set_access_flags + +#endif /* CONFIG_ARM64_CONTPTE */ + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 581caac525b0..5b1701c76d1c 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -29,13 +29,6 @@ void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name); static inline void ptdump_debugfs_register(struct ptdump_info *info, const char *name) { } #endif -void ptdump_check_wx(void); #endif /* CONFIG_PTDUMP_CORE */ -#ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_check_wx() -#else -#define debug_checkwx() do { } while (0) -#endif - #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 1deb5d789c2e..3b0e8248e1a4 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -422,7 +422,7 @@ do { \ #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); -static inline void __flush_tlb_range(struct vm_area_struct *vma, +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level, int tlb_level) @@ -456,10 +456,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true, lpa2_is_enabled()); - dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } +static inline void __flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long stride, bool last_level, + int tlb_level) +{ + __flush_tlb_range_nosync(vma, start, end, stride, + last_level, tlb_level); + dsb(ish); +} + static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 14b4a179bad3..763824963ed1 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -65,7 +65,7 @@ obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_ARM64_MTE) += mte.o diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 0228001347be..9afcc690fe73 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = __ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); @@ -111,7 +111,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) pte = set_pte_bit(pte, __pgprot(PTE_PXN)); else if (system_supports_bti_kernel() && spd->has_bti) pte = set_pte_bit(pte, __pgprot(PTE_GP)); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index b38aae5b488d..82e2203d86a3 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -255,7 +255,7 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Starting crashdump kernel...\n"); } -#ifdef CONFIG_HIBERNATION +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) /* * To preserve the crash dump kernel image, the relevant memory segments * should be mapped again around the hibernation. diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 0e017358f4ba..af1ca875c52c 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -39,6 +39,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } +#ifdef CONFIG_CRASH_DUMP static int prepare_elf_headers(void **addr, unsigned long *sz) { struct crash_mem *cmem; @@ -80,6 +81,7 @@ out: kfree(cmem); return ret; } +#endif /* * Tries to add the initrd and DTB to the image. If it is not possible to find @@ -93,8 +95,8 @@ int load_other_segments(struct kimage *image, char *cmdline) { struct kexec_buf kbuf; - void *headers, *dtb = NULL; - unsigned long headers_sz, initrd_load_addr = 0, dtb_len, + void *dtb = NULL; + unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; int ret = 0; @@ -102,7 +104,10 @@ int load_other_segments(struct kimage *image, /* not allocate anything below the kernel */ kbuf.buf_min = kernel_load_addr + kernel_size; +#ifdef CONFIG_CRASH_DUMP /* load elf core header */ + void *headers; + unsigned long headers_sz; if (image->type == KEXEC_TYPE_CRASH) { ret = prepare_elf_headers(&headers, &headers_sz); if (ret) { @@ -130,6 +135,7 @@ int load_other_segments(struct kimage *image, kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", image->elf_load_addr, kbuf.bufsz, kbuf.memsz); } +#endif /* load initrd */ if (initrd) { diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index a41ef3213e1e..dcdcccd40891 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_pte_at() may zero or change the tags of the + * pages is tagged, __set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/vmcore_info.c index 66cde752cd74..b19d5d6cb8b3 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/vmcore_info.c @@ -4,7 +4,7 @@ * Copyright (C) Huawei Futurewei Technologies. */ -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> #include <asm/cpufeature.h> #include <asm/memory.h> #include <asm/pgtable-hwdef.h> @@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void) /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR); vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END); - vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START); vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END); vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START); vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index aaf1d4939739..629145fd3161 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1072,7 +1072,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } else { /* * Only locking to serialise with a concurrent - * set_pte_at() in the VMM but still overriding the + * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ try_page_mte_tagging(page); diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index dbd1bc95967d..60454256945b 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,6 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o fixmap.o +obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c new file mode 100644 index 000000000000..1b64b4c3f8bf --- /dev/null +++ b/arch/arm64/mm/contpte.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include <linux/mm.h> +#include <linux/efi.h> +#include <linux/export.h> +#include <asm/tlbflush.h> + +static inline bool mm_is_user(struct mm_struct *mm) +{ + /* + * Don't attempt to apply the contig bit to kernel mappings, because + * dynamically adding/removing the contig bit can cause page faults. + * These racing faults are ok for user space, since they get serialized + * on the PTL. But kernel mappings can't tolerate faults. + */ + if (unlikely(mm_is_efi(mm))) + return false; + return mm != &init_mm; +} + +static inline pte_t *contpte_align_down(pte_t *ptep) +{ + return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); +} + +static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * Unfold any partially covered contpte block at the beginning and end + * of the range. + */ + + if (ptep != contpte_align_down(ptep) || nr < CONT_PTES) + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + + if (ptep + nr != contpte_align_down(ptep + nr)) { + unsigned long last_addr = addr + PAGE_SIZE * (nr - 1); + pte_t *last_ptep = ptep + nr - 1; + + contpte_try_unfold(mm, last_addr, last_ptep, + __ptep_get(last_ptep)); + } +} + +static void contpte_convert(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long start_addr; + pte_t *start_ptep; + int i; + + start_ptep = ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte)); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) { + pte_t ptent = __ptep_get_and_clear(mm, addr, ptep); + + if (pte_dirty(ptent)) + pte = pte_mkdirty(pte); + + if (pte_young(ptent)) + pte = pte_mkyoung(pte); + } + + __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); + + __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); +} + +void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the virtual and pysical addresses are + * correctly aligned for a contpte mapping in contpte_try_fold() so the + * remaining checks are to ensure that the contpte range is fully + * covered by a single folio, and ensure that all the ptes are valid + * with contiguous PFNs and matching prots. We ignore the state of the + * access and dirty bits for the purpose of deciding if its a contiguous + * range; the folding process will generate a single contpte entry which + * has a single access and dirty bit. Those 2 bits are the logical OR of + * their respective bits in the constituent pte entries. In order to + * ensure the contpte range is covered by a single folio, we must + * recover the folio from the pfn, but special mappings don't have a + * folio backing them. Fortunately contpte_try_fold() already checked + * that the pte is not special - we never try to fold special mappings. + * Note we can't use vm_normal_page() for this since we don't have the + * vma. + */ + + unsigned long folio_start, folio_end; + unsigned long cont_start, cont_end; + pte_t expected_pte, subpte; + struct folio *folio; + struct page *page; + unsigned long pfn; + pte_t *orig_ptep; + pgprot_t prot; + + int i; + + if (!mm_is_user(mm)) + return; + + page = pte_page(pte); + folio = page_folio(page); + folio_start = addr - (page - &folio->page) * PAGE_SIZE; + folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE; + cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE); + cont_end = cont_start + CONT_PTE_SIZE; + + if (folio_start > cont_start || folio_end < cont_end) + return; + + pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + expected_pte = pfn_pte(pfn, prot); + orig_ptep = ptep; + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++) { + subpte = pte_mkold(pte_mkclean(__ptep_get(ptep))); + if (!pte_same(subpte, expected_pte)) + return; + expected_pte = pte_advance_pfn(expected_pte, 1); + ptep++; + } + + pte = pte_mkcont(pte); + contpte_convert(mm, addr, orig_ptep, pte); +} +EXPORT_SYMBOL_GPL(__contpte_try_fold); + +void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the ptes are contiguous in + * contpte_try_unfold(), so just check that the mm is user space. + */ + if (!mm_is_user(mm)) + return; + + pte = pte_mknoncont(pte); + contpte_convert(mm, addr, ptep, pte); +} +EXPORT_SYMBOL_GPL(__contpte_try_unfold); + +pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. We are guaranteed to be holding the PTL, so any + * contiguous range cannot be unfolded or otherwise modified under our + * feet. + */ + + pte_t pte; + int i; + + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++) { + pte = __ptep_get(ptep); + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL_GPL(contpte_ptep_get); + +pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) +{ + /* + * The ptep_get_lockless() API requires us to read and return *orig_ptep + * so that it is self-consistent, without the PTL held, so we may be + * racing with other threads modifying the pte. Usually a READ_ONCE() + * would suffice, but for the contpte case, we also need to gather the + * access and dirty bits from across all ptes in the contiguous block, + * and we can't read all of those neighbouring ptes atomically, so any + * contiguous range may be unfolded/modified/refolded under our feet. + * Therefore we ensure we read a _consistent_ contpte range by checking + * that all ptes in the range are valid and have CONT_PTE set, that all + * pfns are contiguous and that all pgprots are the same (ignoring + * access/dirty). If we find a pte that is not consistent, then we must + * be racing with an update so start again. If the target pte does not + * have CONT_PTE set then that is considered consistent on its own + * because it is not part of a contpte range. + */ + + pgprot_t orig_prot; + unsigned long pfn; + pte_t orig_pte; + pgprot_t prot; + pte_t *ptep; + pte_t pte; + int i; + +retry: + orig_pte = __ptep_get(orig_ptep); + + if (!pte_valid_cont(orig_pte)) + return orig_pte; + + orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte))); + ptep = contpte_align_down(orig_ptep); + pfn = pte_pfn(orig_pte) - (orig_ptep - ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) { + pte = __ptep_get(ptep); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + + if (!pte_valid_cont(pte) || + pte_pfn(pte) != pfn || + pgprot_val(prot) != pgprot_val(orig_prot)) + goto retry; + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL_GPL(contpte_ptep_get_lockless); + +void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + unsigned long next; + unsigned long end; + unsigned long pfn; + pgprot_t prot; + + /* + * The set_ptes() spec guarantees that when nr > 1, the initial state of + * all ptes is not-present. Therefore we never need to unfold or + * otherwise invalidate a range before we set the new ptes. + * contpte_set_ptes() should never be called for nr < 2. + */ + VM_WARN_ON(nr == 1); + + if (!mm_is_user(mm)) + return __set_ptes(mm, addr, ptep, pte, nr); + + end = addr + (nr << PAGE_SHIFT); + pfn = pte_pfn(pte); + prot = pte_pgprot(pte); + + do { + next = pte_cont_addr_end(addr, end); + nr = (next - addr) >> PAGE_SHIFT; + pte = pfn_pte(pfn, prot); + + if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0) + pte = pte_mkcont(pte); + else + pte = pte_mknoncont(pte); + + __set_ptes(mm, addr, ptep, pte, nr); + + addr = next; + ptep += nr; + pfn += nr; + + } while (addr != end); +} +EXPORT_SYMBOL_GPL(contpte_set_ptes); + +void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + __clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL_GPL(contpte_clear_full_ptes); + +pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes); + +int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + /* + * ptep_clear_flush_young() technically requires us to clear the access + * flag for a _single_ pte. However, the core-mm code actually tracks + * access/dirty per folio, not per page. And since we only create a + * contig range when the range is covered by a single folio, we can get + * away with clearing young for the whole contig range here, so we avoid + * having to unfold. + */ + + int young = 0; + int i; + + ptep = contpte_align_down(ptep); + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + young |= __ptep_test_and_clear_young(vma, addr, ptep); + + return young; +} +EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young); + +int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int young; + + young = contpte_ptep_test_and_clear_young(vma, addr, ptep); + + if (young) { + /* + * See comment in __ptep_clear_flush_young(); same rationale for + * eliding the trailing DSB applies here. + */ + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE, + PAGE_SIZE, true, 3); + } + + return young; +} +EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young); + +void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * If wrprotecting an entire contig range, we can avoid unfolding. Just + * set wrprotect and wait for the later mmu_gather flush to invalidate + * the tlb. Until the flush, the page may or may not be wrprotected. + * After the flush, it is guaranteed wrprotected. If it's a partial + * range though, we must unfold, because we can't have a case where + * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this + * would cause it to continue to be unpredictable after the flush. + */ + + contpte_try_unfold_partial(mm, addr, ptep, nr); + __wrprotect_ptes(mm, addr, ptep, nr); +} +EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes); + +int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + unsigned long start_addr; + pte_t orig_pte; + int i; + + /* + * Gather the access/dirty bits for the contiguous range. If nothing has + * changed, its a noop. + */ + orig_pte = pte_mknoncont(ptep_get(ptep)); + if (pte_val(orig_pte) == pte_val(entry)) + return 0; + + /* + * We can fix up access/dirty bits without having to unfold the contig + * range. But if the write bit is changing, we must unfold. + */ + if (pte_write(orig_pte) == pte_write(entry)) { + /* + * For HW access management, we technically only need to update + * the flag on a single pte in the range. But for SW access + * management, we need to update all the ptes to prevent extra + * faults. Avoid per-page tlb flush in __ptep_set_access_flags() + * and instead flush the whole range at the end. + */ + ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + __ptep_set_access_flags(vma, addr, ptep, entry, 0); + + if (dirty) + __flush_tlb_range(vma, start_addr, addr, + PAGE_SIZE, true, 3); + } else { + __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); + __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + } + + return 1; +} +EXPORT_SYMBOL_GPL(contpte_ptep_set_access_flags); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 60265ede48fe..8251e2fea9c7 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -191,7 +191,7 @@ static void show_pte(unsigned long addr) if (!ptep) break; - pte = READ_ONCE(*ptep); + pte = __ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -205,16 +205,16 @@ static void show_pte(unsigned long addr) * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, - * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * like __set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. */ -int ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) +int __ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = __ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index d22506e9c7fd..de1e09d986ad 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -124,9 +124,9 @@ void __set_fixmap(enum fixed_addresses idx, ptep = fixmap_pte(addr); if (pgprot_val(flags)) { - set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); } else { - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr+PAGE_SIZE); } } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 8116ac599f80..0f0e10bb0a95 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -45,13 +45,6 @@ void __init arm64_hugetlb_cma_reserve(void) else order = CONT_PMD_SHIFT - PAGE_SHIFT; - /* - * HugeTLB CMA reservation is required for gigantic - * huge pages which could not be allocated via the - * page allocator. Just warn if there is any change - * breaking this assumption. - */ - WARN_ON(order <= MAX_PAGE_ORDER); hugetlb_cma_reserve(order); } #endif /* CONFIG_CMA */ @@ -152,14 +145,14 @@ pte_t huge_ptep_get(pte_t *ptep) { int ncontig, i; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if (!pte_present(orig_pte) || !pte_cont(orig_pte)) return orig_pte; ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize); for (i = 0; i < ncontig; i++, ptep++) { - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); @@ -184,11 +177,11 @@ static pte_t get_clear_contig(struct mm_struct *mm, unsigned long pgsize, unsigned long ncontig) { - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); unsigned long i; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { - pte_t pte = ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __ptep_get_and_clear(mm, addr, ptep); /* * If HW_AFDBM is enabled, then the HW could turn on @@ -236,7 +229,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - ptep_clear(mm, addr, ptep); + __ptep_get_and_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } @@ -254,12 +247,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - set_pte_at(mm, addr, ptep, pte); + __set_ptes(mm, addr, ptep, pte, 1); return; } if (!pte_cont(pte)) { - set_pte_at(mm, addr, ptep, pte); + __set_ptes(mm, addr, ptep, pte, 1); return; } @@ -270,7 +263,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -400,7 +393,7 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, ncontig = num_contig_ptes(sz, &pgsize); for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - pte_clear(mm, addr, ptep); + __pte_clear(mm, addr, ptep); } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, @@ -408,10 +401,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, { int ncontig; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if (!pte_cont(orig_pte)) - return ptep_get_and_clear(mm, addr, ptep); + return __ptep_get_and_clear(mm, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -431,11 +424,11 @@ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig) { int i; - if (pte_write(pte) != pte_write(ptep_get(ptep))) + if (pte_write(pte) != pte_write(__ptep_get(ptep))) return 1; for (i = 0; i < ncontig; i++) { - pte_t orig_pte = ptep_get(ptep + i); + pte_t orig_pte = __ptep_get(ptep + i); if (pte_dirty(pte) != pte_dirty(orig_pte)) return 1; @@ -459,7 +452,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t orig_pte; if (!pte_cont(pte)) - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); ncontig = find_num_contig(mm, addr, ptep, &pgsize); dpfn = pgsize >> PAGE_SHIFT; @@ -478,7 +471,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); return 1; } @@ -492,8 +485,8 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(READ_ONCE(*ptep))) { - ptep_set_wrprotect(mm, addr, ptep); + if (!pte_cont(__ptep_get(ptep))) { + __ptep_set_wrprotect(mm, addr, ptep); return; } @@ -507,7 +500,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pfn = pte_pfn(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -517,7 +510,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(READ_ONCE(*ptep))) + if (!pte_cont(__ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -550,7 +543,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0f427b50fdc3..03efd86dce0a 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -100,7 +100,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index fbddbf9faf19..b65a29440a0c 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -125,8 +125,8 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, if (!early) memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; - set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep))); + __set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); + } while (ptep++, addr = next, addr != end && pte_none(__ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, @@ -366,7 +366,7 @@ static void __init kasan_init_shadow(void) * so we should make sure that it maps the zero page read-only. */ for (i = 0; i < PTRS_PER_PTE; i++) - set_pte(&kasan_early_shadow_pte[i], + __set_pte(&kasan_early_shadow_pte[i], pfn_pte(sym_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO)); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index bf5b1c426ad0..495b732d5af3 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -179,16 +179,16 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = READ_ONCE(*ptep); + pte_t old_pte = __ptep_get(ptep); - set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + __set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), - READ_ONCE(pte_val(*ptep)))); + pte_val(__ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); @@ -682,8 +682,6 @@ void mark_rodata_ro(void) WRITE_ONCE(rodata_is_rw, false); update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); - - debug_checkwx(); } static void __init declare_vma(struct vm_struct *vma, @@ -846,12 +844,12 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = __ptep_get(ptep); if (pte_none(pte)) continue; WARN_ON(!pte_present(pte)); - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); if (free_mapped) free_hotplug_page_range(pte_page(pte), @@ -979,7 +977,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = __ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -998,7 +996,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(READ_ONCE(ptep[i]))) + if (!pte_none(__ptep_get(&ptep[i]))) return; } @@ -1494,7 +1492,7 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); } return ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 924843f1f661..0c4e3ecf989d 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -36,12 +36,12 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = __ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } @@ -245,5 +245,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(READ_ONCE(*ptep)); + return pte_valid(__ptep_get(ptep)); } diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 5b87f8d623f7..6986827e0d64 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -322,7 +322,7 @@ static struct ptdump_info kernel_ptdump_info __ro_after_init = { .mm = &init_mm, }; -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, @@ -343,11 +343,16 @@ void ptdump_check_wx(void) ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages || st.uxn_pages) + if (st.wx_pages || st.uxn_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", st.wx_pages, st.uxn_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no W+X pages found\n"); + + return true; + } } static int __init ptdump_init(void) diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 7b14df3c6477..5139a28130c0 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = READ_ONCE(*src_ptep); + pte_t pte = __ptep_get(src_ptep); if (pte_valid(pte)) { /* @@ -41,7 +41,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. */ - set_pte(dst_ptep, pte_mkwrite_novma(pte)); + __set_pte(dst_ptep, pte_mkwrite_novma(pte)); } else if ((debug_pagealloc_enabled() || is_kfence_address((void *)addr)) && !pte_none(pte)) { /* @@ -55,7 +55,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); + __set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); } } |