From a5e8131a0329673f70faee2e9ffb02e8a5bb3c89 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Jan 2024 11:34:33 +0100 Subject: arm64, powerpc, riscv, s390, x86: ptdump: refactor CONFIG_DEBUG_WX All architectures using the core ptdump functionality also implement CONFIG_DEBUG_WX, and they all do it more or less the same way, with a function called debug_checkwx() that is called by mark_rodata_ro(), which is a substitute to ptdump_check_wx() when CONFIG_DEBUG_WX is set and a no-op otherwise. Refactor by centrally defining debug_checkwx() in linux/ptdump.h and call debug_checkwx() immediately after calling mark_rodata_ro() instead of calling it at the end of every mark_rodata_ro(). On x86_32, mark_rodata_ro() first checks __supported_pte_mask has _PAGE_NX before calling debug_checkwx(). Now the check is inside the callee ptdump_walk_pgd_level_checkwx(). On powerpc_64, mark_rodata_ro() bails out early before calling ptdump_check_wx() when the MMU doesn't have KERNEL_RO feature. The check is now also done in ptdump_check_wx() as it is called outside mark_rodata_ro(). Link: https://lkml.kernel.org/r/a59b102d7964261d31ead0316a9f18628e4e7a8e.1706610398.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Alexandre Ghiti Cc: Albert Ou Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V (IBM)" Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Greg KH Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Kees Cook Cc: Mark Rutland Cc: Michael Ellerman Cc: "Naveen N. 
Rao" Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Phong Tran Cc: Russell King Cc: Steven Price Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/ptdump.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 581caac525b0..5b1701c76d1c 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -29,13 +29,6 @@ void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name); static inline void ptdump_debugfs_register(struct ptdump_info *info, const char *name) { } #endif -void ptdump_check_wx(void); #endif /* CONFIG_PTDUMP_CORE */ -#ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_check_wx() -#else -#define debug_checkwx() do { } while (0) -#endif - #endif /* __ASM_PTDUMP_H */ -- cgit v1.2.3 From 6e8f588708971e0626f5be808e8c4b6cdb86eb0b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 29 Jan 2024 13:46:35 +0100 Subject: arm64/mm: make set_ptes() robust when OAs cross 48-bit boundary Patch series "mm/memory: optimize fork() with PTE-mapped THP", v3. Now that the rmap overhaul[1] is upstream that provides a clean interface for rmap batching, let's implement PTE batching during fork when processing PTE-mapped THPs. This series is partially based on Ryan's previous work[2] to implement cont-pte support on arm64, but it's a complete rewrite based on [1] to optimize all architectures independent of any such PTE bits, and to use the new rmap batching functions that simplify the code and prepare for further rmap accounting changes. We collect consecutive PTEs that map consecutive pages of the same large folio, making sure that the other PTE bits are compatible, and (a) adjust the refcount only once per batch, (b) call rmap handling functions only once per batch and (c) perform batch PTE setting/updates. 
While this series should be beneficial for adding cont-pte support on ARM64[2], it's one of the requirements for maintaining a total mapcount[3] for large folios with minimal added overhead and further changes[4] that build up on top of the total mapcount. Independent of all that, this series results in a speedup during fork with PTE-mapped THP, which is the default with THPs that are smaller than a PMD (for example, 16KiB to 1024KiB mTHPs for anonymous memory[5]). On an Intel Xeon Silver 4210R CPU, fork'ing with 1GiB of PTE-mapped folios of the same size (stddev < 1%) results in the following runtimes for fork() (shorter is better):

Folio Size | v6.8-rc1 |      New | Change
------------------------------------------
      4KiB | 0.014328 | 0.014035 |   - 2%
     16KiB | 0.014263 | 0.01196  |   -16%
     32KiB | 0.014334 | 0.01094  |   -24%
     64KiB | 0.014046 | 0.010444 |   -26%
    128KiB | 0.014011 | 0.010063 |   -28%
    256KiB | 0.013993 | 0.009938 |   -29%
    512KiB | 0.013983 | 0.00985  |   -30%
   1024KiB | 0.013986 | 0.00982  |   -30%
   2048KiB | 0.014305 | 0.010076 |   -30%

Note that these numbers are even better than the ones from v1 (verified over multiple reboots), even though there were only minimal code changes. Well, I removed a pte_mkclean() call for anon folios, maybe that also plays a role. But my experience is that fork() is extremely sensitive to code size, inlining, ... so I suspect we'll see on other architectures rather a change of -20% instead of -30%, and it will be easy to "lose" some of that speedup in the future by subtle code changes. Next up is PTE batching when unmapping. Only tested on x86-64. Compile-tested on most other architectures. 
[1] https://lkml.kernel.org/r/20231220224504.646757-1-david@redhat.com [2] https://lkml.kernel.org/r/20231218105100.172635-1-ryan.roberts@arm.com [3] https://lkml.kernel.org/r/20230809083256.699513-1-david@redhat.com [4] https://lkml.kernel.org/r/20231124132626.235350-1-david@redhat.com [5] https://lkml.kernel.org/r/20231207161211.2374093-1-ryan.roberts@arm.com This patch (of 15): Since the high bits [51:48] of an OA are not stored contiguously in the PTE, there is a theoretical bug in set_ptes(), which just adds PAGE_SIZE to the pte to get the pte with the next pfn. This works until the pfn crosses the 48-bit boundary, at which point we overflow into the upper attributes. Of course one could argue (and Matthew Wilcox has :) that we will never see a folio cross this boundary because we only allow naturally aligned power-of-2 allocation, so this would require a half-petabyte folio. So it's only a theoretical bug. But it's better that the code is robust regardless. I've implemented pte_next_pfn() as part of the fix, which is an opt-in core-mm interface. So that is now available to the core-mm, which will be needed shortly to support forthcoming fork()-batching optimizations. Link: https://lkml.kernel.org/r/20240129124649.189745-1-david@redhat.com Link: https://lkml.kernel.org/r/20240125173534.1659317-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240129124649.189745-2-david@redhat.com Fixes: 4a169d61c2ed ("arm64: implement the new page table range API") Closes: https://lore.kernel.org/linux-mm/fdaeb9a5-d890-499a-92c8-d171df43ad01@arm.com/ Signed-off-by: Ryan Roberts Signed-off-by: David Hildenbrand Reviewed-by: Catalin Marinas Reviewed-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Alexandre Ghiti Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 79ce70fbb751..52d0b0a763f1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -341,6 +341,22 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) mte_sync_tags(pte, nr_pages); } +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + +#define pte_next_pfn pte_next_pfn +static inline pte_t pte_next_pfn(pte_t pte) +{ + return pfn_pte(pte_pfn(pte) + 1, pte_pgprot(pte)); +} + static inline void set_ptes(struct mm_struct *mm, unsigned long __always_unused addr, pte_t *ptep, pte_t pte, unsigned int nr) @@ -354,7 +370,7 @@ static inline void set_ptes(struct mm_struct *mm, if (--nr == 0) break; ptep++; - pte_val(pte) += PAGE_SIZE; + pte = pte_next_pfn(pte); } } #define set_ptes set_ptes @@ -433,16 +449,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); } -/* - * Select all bits except the pfn - */ -static inline pgprot_t pte_pgprot(pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - - return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); -} - #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/linux/pgtable.h -- cgit v1.2.3 From c1bd2b4028ae5b4d2ada64b31c40cc44cdf00972 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:51 +0000 Subject: arm64/mm: convert pte_next_pfn() to pte_advance_pfn() Core-mm needs to be able to advance the pfn by 
an arbitrary amount, so override the new pte_advance_pfn() API to do so. Link: https://lkml.kernel.org/r/20240215103205.2607016-5-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 52d0b0a763f1..b6d3e9e0a946 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -351,10 +351,10 @@ static inline pgprot_t pte_pgprot(pte_t pte) return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); } -#define pte_next_pfn pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#define pte_advance_pfn pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return pfn_pte(pte_pfn(pte) + 1, pte_pgprot(pte)); + return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } static inline void set_ptes(struct mm_struct *mm, @@ -370,7 +370,7 @@ static inline void set_ptes(struct mm_struct *mm, if (--nr == 0) break; ptep++; - pte = pte_next_pfn(pte); + pte = pte_advance_pfn(pte, 1); } } #define set_ptes set_ptes -- cgit v1.2.3 From 532736558e8ef2865eae1d84b52dda4422cac810 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:54 +0000 Subject: arm64/mm: convert READ_ONCE(*ptep) to ptep_get(ptep) There are a number of places in the arch code that read a pte by using the READ_ONCE() macro. 
Refactor these call sites to instead use the ptep_get() helper, which itself is a READ_ONCE(). Generated code should be the same. This will benefit us when we shortly introduce the transparent contpte support. In this case, ptep_get() will become more complex so we now have all the code abstracted through it. Link: https://lkml.kernel.org/r/20240215103205.2607016-8-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 12 +++++++++--- arch/arm64/kernel/efi.c | 2 +- arch/arm64/mm/fault.c | 4 ++-- arch/arm64/mm/hugetlbpage.c | 6 +++--- arch/arm64/mm/kasan_init.c | 2 +- arch/arm64/mm/mmu.c | 12 ++++++------ arch/arm64/mm/pageattr.c | 4 ++-- arch/arm64/mm/trans_pgd.c | 2 +- 8 files changed, 25 insertions(+), 19 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b6d3e9e0a946..de034ca40bad 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -275,6 +275,12 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} + extern void __sync_icache_dcache(pte_t pteval); bool pgattr_change_is_safe(u64 old, u64 new); @@ -302,7 +308,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = READ_ONCE(*ptep); + old_pte = ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -904,7 +910,7 @@ static inline int 
__ptep_test_and_clear_young(pte_t *ptep) { pte_t old_pte, pte; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -986,7 +992,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres { pte_t old_pte, pte; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 0228001347be..d0e08e93b246 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 55f6455a8284..a254761fa1bd 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -191,7 +191,7 @@ static void show_pte(unsigned long addr) if (!ptep) break; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -214,7 +214,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 6720ec8d50e7..2892f925ed66 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -485,7 +485,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(READ_ONCE(*ptep))) { + if (!pte_cont(ptep_get(ptep))) { ptep_set_wrprotect(mm, addr, ptep); return; } @@ -510,7 +510,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(READ_ONCE(*ptep))) + if (!pte_cont(ptep_get(ptep))) 
return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -543,7 +543,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 4c7ad574b946..c2a9f4f6c7dd 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -113,7 +113,7 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep))); + } while (ptep++, addr = next, addr != end && pte_none(ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 3a27d887f7dd..343629a17042 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -173,7 +173,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = READ_ONCE(*ptep); + pte_t old_pte = ptep_get(ptep); set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); @@ -182,7 +182,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, * only allow updates to the permission attributes. 
*/ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), - READ_ONCE(pte_val(*ptep)))); + pte_val(ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); @@ -852,7 +852,7 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); if (pte_none(pte)) continue; @@ -985,7 +985,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -1004,7 +1004,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(READ_ONCE(ptep[i]))) + if (!pte_none(ptep_get(&ptep[i]))) return; } @@ -1473,7 +1473,7 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. 
*/ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); } return ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 924843f1f661..73a5e8f82586 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -36,7 +36,7 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); @@ -245,5 +245,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(READ_ONCE(*ptep)); + return pte_valid(ptep_get(ptep)); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 7b14df3c6477..f71ab4704cce 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = READ_ONCE(*src_ptep); + pte_t pte = ptep_get(src_ptep); if (pte_valid(pte)) { /* -- cgit v1.2.3 From 659e193027910a5d3083e34b488ab459d2ef5082 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:55 +0000 Subject: arm64/mm: convert set_pte_at() to set_ptes(..., 1) Since set_ptes() was introduced, set_pte_at() has been implemented as a generic macro around set_ptes(..., 1). So this change should continue to generate the same code. However, making this change prepares us for the transparent contpte support. It means we can reroute set_ptes() to __set_ptes(). Since set_pte_at() is a generic macro, there will be no equivalent __set_pte_at() to reroute to. Note that a couple of calls to set_pte_at() remain in the arch code. 
This is intentional, since those call sites are acting on behalf of core-mm and should continue to call into the public set_ptes() rather than the arch-private __set_ptes(). Link: https://lkml.kernel.org/r/20240215103205.2607016-9-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/arm64/kernel/mte.c | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/mm/fault.c | 2 +- arch/arm64/mm/hugetlbpage.c | 10 +++++----- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index de034ca40bad..9a2df85eb493 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1084,7 +1084,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif /* CONFIG_ARM64_MTE */ /* - * On AArch64, the cache coherency is handled via the set_pte_at() function. + * On AArch64, the cache coherency is handled via the set_ptes() function. */ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index a41ef3213e1e..59bfe2e96f8f 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. 
If only one of the - * pages is tagged, set_pte_at() may zero or change the tags of the + * pages is tagged, set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index aaf1d4939739..6e0df623c8e9 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1072,7 +1072,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } else { /* * Only locking to serialise with a concurrent - * set_pte_at() in the VMM but still overriding the + * set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ try_page_mte_tagging(page); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a254761fa1bd..3235e23309ec 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -205,7 +205,7 @@ static void show_pte(unsigned long addr) * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, - * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * like set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. 
*/ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2892f925ed66..27f6160890d1 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -247,12 +247,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - set_pte_at(mm, addr, ptep, pte); + set_ptes(mm, addr, ptep, pte, 1); return; } if (!pte_cont(pte)) { - set_pte_at(mm, addr, ptep, pte); + set_ptes(mm, addr, ptep, pte, 1); return; } @@ -263,7 +263,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -471,7 +471,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); return 1; } @@ -500,7 +500,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pfn = pte_pfn(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, -- cgit v1.2.3 From 5a00bfd6a52cf31e93d5f1b734087deb32a3cffa Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:57 +0000 Subject: arm64/mm: new ptep layer to manage contig bit Create a new layer for the in-table PTE manipulation APIs. For now, The existing API is prefixed with double underscore to become the arch-private API and the public API is just a simple wrapper that calls the private API. 
The public API implementation will subsequently be used to transparently manipulate the contiguous bit where appropriate. But since there are already some contig-aware users (e.g. hugetlb, kernel mapper), we must first ensure those users use the private API directly so that the future contig-bit manipulations in the public API do not interfere with those existing uses. The following APIs are treated this way: - ptep_get - set_pte - set_ptes - pte_clear - ptep_get_and_clear - ptep_test_and_clear_young - ptep_clear_flush_young - ptep_set_wrprotect - ptep_set_access_flags Link: https://lkml.kernel.org/r/20240215103205.2607016-11-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 83 ++++++++++++++++++++++------------------ arch/arm64/kernel/efi.c | 4 +- arch/arm64/kernel/mte.c | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/mm/fault.c | 12 +++--- arch/arm64/mm/fixmap.c | 4 +- arch/arm64/mm/hugetlbpage.c | 40 +++++++++---------- arch/arm64/mm/kasan_init.c | 6 +-- arch/arm64/mm/mmu.c | 14 +++---- arch/arm64/mm/pageattr.c | 6 +-- arch/arm64/mm/trans_pgd.c | 6 +-- 11 files changed, 93 insertions(+), 86 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9a2df85eb493..7336d40a893a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -93,7 +93,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define 
pte_none(pte) (!pte_val(pte)) -#define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) +#define __pte_clear(mm, addr, ptep) \ + __set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) /* @@ -137,7 +138,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) * so that we don't erroneously return false for pages that have been * remapped as PROT_NONE but are yet to be flushed from the TLB. * Note that we can't make any assumptions based on the state of the access - * flag, since ptep_clear_flush_young() elides a DSB when invalidating the + * flag, since __ptep_clear_flush_young() elides a DSB when invalidating the * TLB. */ #define pte_accessible(mm, pte) \ @@ -261,7 +262,7 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } -static inline void set_pte(pte_t *ptep, pte_t pte) +static inline void __set_pte(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); @@ -275,8 +276,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } -#define ptep_get ptep_get -static inline pte_t ptep_get(pte_t *ptep) +static inline pte_t __ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } @@ -308,7 +308,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = ptep_get(ptep); + old_pte = __ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -317,7 +317,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, /* * Check for potential race with hardware updates of the pte - * (ptep_set_access_flags safely changes valid ptes without going + * (__ptep_set_access_flags safely changes valid ptes without going * through an invalid entry). 
*/ VM_WARN_ONCE(!pte_young(pte), @@ -363,23 +363,22 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } -static inline void set_ptes(struct mm_struct *mm, - unsigned long __always_unused addr, - pte_t *ptep, pte_t pte, unsigned int nr) +static inline void __set_ptes(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); __sync_cache_and_tags(pte, nr); for (;;) { __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_advance_pfn(pte, 1); } } -#define set_ptes set_ptes /* * Huge pte definitions. @@ -546,7 +545,7 @@ static inline void __set_pte_at(struct mm_struct *mm, { __sync_cache_and_tags(pte, nr); __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, @@ -860,8 +859,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -extern int ptep_set_access_flags(struct vm_area_struct *vma, +extern int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); @@ -871,7 +869,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { - return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); + return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, + pmd_pte(entry), dirty); } static inline int pud_devmap(pud_t pud) @@ -905,12 +904,13 @@ static inline bool pud_user_accessible_page(pud_t pud) /* * Atomic pte/pmd modifications. 
*/ -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int __ptep_test_and_clear_young(pte_t *ptep) +static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) { pte_t old_pte, pte; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -921,18 +921,10 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep) return pte_young(pte); } -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) -{ - return __ptep_test_and_clear_young(ptep); -} - -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, +static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young = ptep_test_and_clear_young(vma, address, ptep); + int young = __ptep_test_and_clear_young(vma, address, ptep); if (young) { /* @@ -955,12 +947,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); + return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, +static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); @@ -984,15 +975,15 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * ptep_set_wrprotect - mark read-only while trasferring potential hardware + * __ptep_set_wrprotect - mark read-only while trasferring potential hardware * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. 
*/ -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) +static inline void __ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep) { pte_t old_pte, pte; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); @@ -1006,7 +997,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - ptep_set_wrprotect(mm, address, (pte_t *)pmdp); + __ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #define pmdp_establish pmdp_establish @@ -1084,7 +1075,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif /* CONFIG_ARM64_MTE */ /* - * On AArch64, the cache coherency is handled via the set_ptes() function. + * On AArch64, the cache coherency is handled via the __set_ptes() function. */ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -1136,6 +1127,22 @@ extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma, extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); + +#define ptep_get __ptep_get +#define set_pte __set_pte +#define set_ptes __set_ptes +#define pte_clear __pte_clear +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define ptep_get_and_clear __ptep_get_and_clear +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young __ptep_test_and_clear_young +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define ptep_clear_flush_young __ptep_clear_flush_young +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +#define ptep_set_wrprotect __ptep_set_wrprotect +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags __ptep_set_access_flags + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff 
--git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index d0e08e93b246..9afcc690fe73 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); @@ -111,7 +111,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) pte = set_pte_bit(pte, __pgprot(PTE_PXN)); else if (system_supports_bti_kernel() && spd->has_bti) pte = set_pte_bit(pte, __pgprot(PTE_GP)); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 59bfe2e96f8f..dcdcccd40891 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_ptes() may zero or change the tags of the + * pages is tagged, __set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 6e0df623c8e9..629145fd3161 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1072,7 +1072,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } else { /* * Only locking to serialise with a concurrent - * set_ptes() in the VMM but still overriding the + * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. 
*/ try_page_mte_tagging(page); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3235e23309ec..9a1c66183d16 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -191,7 +191,7 @@ static void show_pte(unsigned long addr) if (!ptep) break; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -205,16 +205,16 @@ static void show_pte(unsigned long addr) * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, - * like set_ptes(), the PTE is never changed from no-exec to exec here. + * like __set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. */ -int ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) +int __ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index c0a3301203bd..bfc02568805a 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -121,9 +121,9 @@ void __set_fixmap(enum fixed_addresses idx, ptep = fixmap_pte(addr); if (pgprot_val(flags)) { - set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); } else { - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr+PAGE_SIZE); } } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 48e8b429879d..0f0e10bb0a95 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -145,14 +145,14 @@ pte_t huge_ptep_get(pte_t *ptep) { int ncontig, i; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t 
orig_pte = __ptep_get(ptep); if (!pte_present(orig_pte) || !pte_cont(orig_pte)) return orig_pte; ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize); for (i = 0; i < ncontig; i++, ptep++) { - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); @@ -177,11 +177,11 @@ static pte_t get_clear_contig(struct mm_struct *mm, unsigned long pgsize, unsigned long ncontig) { - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); unsigned long i; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { - pte_t pte = ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __ptep_get_and_clear(mm, addr, ptep); /* * If HW_AFDBM is enabled, then the HW could turn on @@ -229,7 +229,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - ptep_get_and_clear(mm, addr, ptep); + __ptep_get_and_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } @@ -247,12 +247,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - set_ptes(mm, addr, ptep, pte, 1); + __set_ptes(mm, addr, ptep, pte, 1); return; } if (!pte_cont(pte)) { - set_ptes(mm, addr, ptep, pte, 1); + __set_ptes(mm, addr, ptep, pte, 1); return; } @@ -263,7 +263,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -393,7 +393,7 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, ncontig = num_contig_ptes(sz, &pgsize); for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - pte_clear(mm, addr, ptep); + __pte_clear(mm, addr, ptep); } pte_t 
huge_ptep_get_and_clear(struct mm_struct *mm, @@ -401,10 +401,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, { int ncontig; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if (!pte_cont(orig_pte)) - return ptep_get_and_clear(mm, addr, ptep); + return __ptep_get_and_clear(mm, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -424,11 +424,11 @@ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig) { int i; - if (pte_write(pte) != pte_write(ptep_get(ptep))) + if (pte_write(pte) != pte_write(__ptep_get(ptep))) return 1; for (i = 0; i < ncontig; i++) { - pte_t orig_pte = ptep_get(ptep + i); + pte_t orig_pte = __ptep_get(ptep + i); if (pte_dirty(pte) != pte_dirty(orig_pte)) return 1; @@ -452,7 +452,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t orig_pte; if (!pte_cont(pte)) - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); ncontig = find_num_contig(mm, addr, ptep, &pgsize); dpfn = pgsize >> PAGE_SHIFT; @@ -471,7 +471,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); return 1; } @@ -485,8 +485,8 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(ptep_get(ptep))) { - ptep_set_wrprotect(mm, addr, ptep); + if (!pte_cont(__ptep_get(ptep))) { + __ptep_set_wrprotect(mm, addr, ptep); return; } @@ -500,7 +500,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pfn = pte_pfn(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -510,7 +510,7 @@ 
pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(ptep_get(ptep))) + if (!pte_cont(__ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -543,7 +543,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(ptep_get(ptep))) + if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index c2a9f4f6c7dd..9ee16cfce587 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -112,8 +112,8 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, if (!early) memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; - set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(ptep_get(ptep))); + __set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); + } while (ptep++, addr = next, addr != end && pte_none(__ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, @@ -271,7 +271,7 @@ static void __init kasan_init_shadow(void) * so we should make sure that it maps the zero page read-only. 
*/ for (i = 0; i < PTRS_PER_PTE; i++) - set_pte(&kasan_early_shadow_pte[i], + __set_pte(&kasan_early_shadow_pte[i], pfn_pte(sym_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO)); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 343629a17042..6208c7541f87 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -173,16 +173,16 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = ptep_get(ptep); + pte_t old_pte = __ptep_get(ptep); - set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + __set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), - pte_val(ptep_get(ptep)))); + pte_val(__ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); @@ -852,12 +852,12 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = ptep_get(ptep); + pte = __ptep_get(ptep); if (pte_none(pte)) continue; WARN_ON(!pte_present(pte)); - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); if (free_mapped) free_hotplug_page_range(pte_page(pte), @@ -985,7 +985,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = ptep_get(ptep); + pte = __ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -1004,7 +1004,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(ptep_get(&ptep[i]))) + if (!pte_none(__ptep_get(&ptep[i]))) return; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 73a5e8f82586..0c4e3ecf989d 100644 --- a/arch/arm64/mm/pageattr.c +++ 
b/arch/arm64/mm/pageattr.c @@ -36,12 +36,12 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } @@ -245,5 +245,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(ptep_get(ptep)); + return pte_valid(__ptep_get(ptep)); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index f71ab4704cce..5139a28130c0 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = ptep_get(src_ptep); + pte_t pte = __ptep_get(src_ptep); if (pte_valid(pte)) { /* @@ -41,7 +41,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. 
*/ - set_pte(dst_ptep, pte_mkwrite_novma(pte)); + __set_pte(dst_ptep, pte_mkwrite_novma(pte)); } else if ((debug_pagealloc_enabled() || is_kfence_address((void *)addr)) && !pte_none(pte)) { /* @@ -55,7 +55,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); + __set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); } } -- cgit v1.2.3 From d9d8dc2bd3fb2689309f704fe85e6dde2b1bd73a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:58 +0000 Subject: arm64/mm: split __flush_tlb_range() to elide trailing DSB Split __flush_tlb_range() into __flush_tlb_range_nosync() + __flush_tlb_range(), in the same way as the existing flush_tlb_page() arrangement. This allows calling __flush_tlb_range_nosync() to elide the trailing DSB. Forthcoming "contpte" code will take advantage of this when clearing the young bit from a contiguous range of ptes. Ordering between dsb and mmu_notifier_arch_invalidate_secondary_tlbs() has changed, but now aligns with the ordering of __flush_tlb_page(). It has been discussed that __flush_tlb_page() may be wrong though. Regardless, both will be resolved separately if needed. Link: https://lkml.kernel.org/r/20240215103205.2607016-12-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 1deb5d789c2e..3b0e8248e1a4 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -422,7 +422,7 @@ do { \ #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); -static inline void __flush_tlb_range(struct vm_area_struct *vma, +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level, int tlb_level) @@ -456,10 +456,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true, lpa2_is_enabled()); - dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } +static inline void __flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long stride, bool last_level, + int tlb_level) +{ + __flush_tlb_range_nosync(vma, start, end, stride, + last_level, tlb_level); + dsb(ish); +} + static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { -- cgit v1.2.3 From 4602e5757bcceb231c3a13c36c373ad4a750eddb Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:59 +0000 Subject: arm64/mm: wire up PTE_CONT for user mappings With the ptep API sufficiently refactored, we can now introduce a new "contpte" API layer, which transparently manages the PTE_CONT bit for user mappings. 
In this initial implementation, only suitable batches of PTEs, set via set_ptes(), are mapped with the PTE_CONT bit. Any subsequent modification of individual PTEs will cause an "unfold" operation to repaint the contpte block as individual PTEs before performing the requested operation. While a modification of a single PTE could cause the block of PTEs to which it belongs to become eligible for "folding" into a contpte entry, "folding" is not performed in this initial implementation due to the cost of checking that the requirements are met. Due to this, contpte mappings will degrade back to normal pte mappings over time if/when protections are changed. This will be solved in a future patch. Since a contpte block only has a single access and dirty bit, the semantics here change slightly; when getting a pte (e.g. ptep_get()) that is part of a contpte mapping, the access and dirty information are pulled from the block (so all ptes in the block return the same access/dirty info). When changing the access/dirty info on a pte (e.g. ptep_set_access_flags()) that is part of a contpte mapping, this change will affect the whole contpte block. This works fine in practice since we guarantee that only a single folio is mapped by a contpte block, and the core-mm tracks access/dirty information per folio. In order for the public functions, which used to be pure inline, to continue to be callable by modules, export all the contpte_* symbols that are now called by those public inline functions. The feature is enabled/disabled with the ARM64_CONTPTE Kconfig parameter at build time. It defaults to enabled as long as its dependency, TRANSPARENT_HUGEPAGE, is also enabled. The core-mm depends upon TRANSPARENT_HUGEPAGE to be able to allocate large folios, so if it's not enabled, then there is no chance of meeting the physical contiguity requirement for contpte mappings. 
Link: https://lkml.kernel.org/r/20240215103205.2607016-13-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Ard Biesheuvel Tested-by: John Hubbard Acked-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 9 ++ arch/arm64/include/asm/pgtable.h | 167 +++++++++++++++++++++++ arch/arm64/mm/Makefile | 1 + arch/arm64/mm/contpte.c | 285 +++++++++++++++++++++++++++++++++++++++ include/linux/efi.h | 5 + 5 files changed, 467 insertions(+) create mode 100644 arch/arm64/mm/contpte.c (limited to 'arch/arm64/include') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index aa7c1d435139..386566138620 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2229,6 +2229,15 @@ config UNWIND_PATCH_PAC_INTO_SCS select UNWIND_TABLES select DYNAMIC_SCS +config ARM64_CONTPTE + bool "Contiguous PTE mappings for user memory" if EXPERT + depends on TRANSPARENT_HUGEPAGE + default y + help + When enabled, user mappings are configured using the PTE contiguous + bit, for any mappings that meet the size and alignment requirements. + This reduces TLB pressure and improves performance. + endmenu # "Kernel Features" menu "Boot options" diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7336d40a893a..831099cfc96b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -133,6 +133,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) */ #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN)) +/* + * Returns true if the pte is valid and has the contiguous bit set. 
+ */ +#define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte)) /* * Could the pte be present in the TLB? We must check mm_tlb_flush_pending * so that we don't erroneously return false for pages that have been @@ -1128,6 +1132,167 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); +#ifdef CONFIG_ARM64_CONTPTE + +/* + * The contpte APIs are used to transparently manage the contiguous bit in ptes + * where it is possible and makes sense to do so. The PTE_CONT bit is considered + * a private implementation detail of the public ptep API (see below). + */ +extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); +extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); +extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); +extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty); + +static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (unlikely(pte_valid_cont(pte))) + __contpte_try_unfold(mm, addr, ptep, pte); +} + +/* + * The below functions constitute the public API that arm64 presents to the + * core-mm to manipulate PTE entries within their page tables (or at least this + * is the subset of the API that arm64 needs to implement). These public + * versions will automatically and transparently apply the contiguous bit where + * it makes sense to do so. Therefore any users that are contig-aware (e.g. 
+ * hugetlb, kernel mapper) should NOT use these APIs, but instead use the + * private versions, which are prefixed with double underscore. All of these + * APIs except for ptep_get_lockless() are expected to be called with the PTL + * held. Although the contiguous bit is considered private to the + * implementation, it is deliberately allowed to leak through the getters (e.g. + * ptep_get()), back to core code. This is required so that pte_leaf_size() can + * provide an accurate size for perf_get_pgtable_size(). But this leakage means + * its possible a pte will be passed to a setter with the contiguous bit set, so + * we explicitly clear the contiguous bit in those cases to prevent accidentally + * setting it in the pgtable. + */ + +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get(ptep, pte); +} + +#define ptep_get_lockless ptep_get_lockless +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get_lockless(ptep); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + /* + * We don't have the mm or vaddr so cannot unfold contig entries (since + * it requires tlb maintenance). set_pte() is not used in core code, so + * this should never even be called. Regardless do our best to service + * any call and emit a warning if there is any attempt to set a pte on + * top of an existing contig range. 
+ */ + pte_t orig_pte = __ptep_get(ptep); + + WARN_ON_ONCE(pte_valid_cont(orig_pte)); + __set_pte(ptep, pte_mknoncont(pte)); +} + +#define set_ptes set_ptes +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + pte = pte_mknoncont(pte); + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __set_ptes(mm, addr, ptep, pte, 1); + } else { + contpte_set_ptes(mm, addr, ptep, pte, nr); + } +} + +static inline void pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __pte_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + return __ptep_get_and_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_test_and_clear_young(vma, addr, ptep); + + return contpte_ptep_test_and_clear_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +static inline int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_clear_flush_young(vma, addr, ptep); + + return contpte_ptep_clear_flush_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __ptep_set_wrprotect(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +static inline int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long 
addr, pte_t *ptep, + pte_t entry, int dirty) +{ + pte_t orig_pte = __ptep_get(ptep); + + entry = pte_mknoncont(entry); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + + return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty); +} + +#else /* CONFIG_ARM64_CONTPTE */ + #define ptep_get __ptep_get #define set_pte __set_pte #define set_ptes __set_ptes @@ -1143,6 +1308,8 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags __ptep_set_access_flags +#endif /* CONFIG_ARM64_CONTPTE */ + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index dbd1bc95967d..60454256945b 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,6 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o fixmap.o +obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c new file mode 100644 index 000000000000..6d7f40667fa2 --- /dev/null +++ b/arch/arm64/mm/contpte.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include + +static inline bool mm_is_user(struct mm_struct *mm) +{ + /* + * Don't attempt to apply the contig bit to kernel mappings, because + * dynamically adding/removing the contig bit can cause page faults. + * These racing faults are ok for user space, since they get serialized + * on the PTL. But kernel mappings can't tolerate faults. 
+ */ + if (unlikely(mm_is_efi(mm))) + return false; + return mm != &init_mm; +} + +static inline pte_t *contpte_align_down(pte_t *ptep) +{ + return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); +} + +static void contpte_convert(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long start_addr; + pte_t *start_ptep; + int i; + + start_ptep = ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte)); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) { + pte_t ptent = __ptep_get_and_clear(mm, addr, ptep); + + if (pte_dirty(ptent)) + pte = pte_mkdirty(pte); + + if (pte_young(ptent)) + pte = pte_mkyoung(pte); + } + + __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); + + __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); +} + +void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the ptes are contiguous in + * contpte_try_unfold(), so just check that the mm is user space. + */ + if (!mm_is_user(mm)) + return; + + pte = pte_mknoncont(pte); + contpte_convert(mm, addr, ptep, pte); +} +EXPORT_SYMBOL(__contpte_try_unfold); + +pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. We are guaranteed to be holding the PTL, so any + * contiguous range cannot be unfolded or otherwise modified under our + * feet. 
+ */ + + pte_t pte; + int i; + + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++) { + pte = __ptep_get(ptep); + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL(contpte_ptep_get); + +pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. We may not be holding the PTL, so any contiguous + * range may be unfolded/modified/refolded under our feet. Therefore we + * ensure we read a _consistent_ contpte range by checking that all ptes + * in the range are valid and have CONT_PTE set, that all pfns are + * contiguous and that all pgprots are the same (ignoring access/dirty). + * If we find a pte that is not consistent, then we must be racing with + * an update so start again. If the target pte does not have CONT_PTE + * set then that is considered consistent on its own because it is not + * part of a contpte range. 
+ */ + + pgprot_t orig_prot; + unsigned long pfn; + pte_t orig_pte; + pgprot_t prot; + pte_t *ptep; + pte_t pte; + int i; + +retry: + orig_pte = __ptep_get(orig_ptep); + + if (!pte_valid_cont(orig_pte)) + return orig_pte; + + orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte))); + ptep = contpte_align_down(orig_ptep); + pfn = pte_pfn(orig_pte) - (orig_ptep - ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) { + pte = __ptep_get(ptep); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + + if (!pte_valid_cont(pte) || + pte_pfn(pte) != pfn || + pgprot_val(prot) != pgprot_val(orig_prot)) + goto retry; + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL(contpte_ptep_get_lockless); + +void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + unsigned long next; + unsigned long end; + unsigned long pfn; + pgprot_t prot; + + /* + * The set_ptes() spec guarantees that when nr > 1, the initial state of + * all ptes is not-present. Therefore we never need to unfold or + * otherwise invalidate a range before we set the new ptes. + * contpte_set_ptes() should never be called for nr < 2. 
+ */ + VM_WARN_ON(nr == 1); + + if (!mm_is_user(mm)) + return __set_ptes(mm, addr, ptep, pte, nr); + + end = addr + (nr << PAGE_SHIFT); + pfn = pte_pfn(pte); + prot = pte_pgprot(pte); + + do { + next = pte_cont_addr_end(addr, end); + nr = (next - addr) >> PAGE_SHIFT; + pte = pfn_pte(pfn, prot); + + if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0) + pte = pte_mkcont(pte); + else + pte = pte_mknoncont(pte); + + __set_ptes(mm, addr, ptep, pte, nr); + + addr = next; + ptep += nr; + pfn += nr; + + } while (addr != end); +} +EXPORT_SYMBOL(contpte_set_ptes); + +int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + /* + * ptep_clear_flush_young() technically requires us to clear the access + * flag for a _single_ pte. However, the core-mm code actually tracks + * access/dirty per folio, not per page. And since we only create a + * contig range when the range is covered by a single folio, we can get + * away with clearing young for the whole contig range here, so we avoid + * having to unfold. + */ + + int young = 0; + int i; + + ptep = contpte_align_down(ptep); + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + young |= __ptep_test_and_clear_young(vma, addr, ptep); + + return young; +} +EXPORT_SYMBOL(contpte_ptep_test_and_clear_young); + +int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int young; + + young = contpte_ptep_test_and_clear_young(vma, addr, ptep); + + if (young) { + /* + * See comment in __ptep_clear_flush_young(); same rationale for + * eliding the trailing DSB applies here. 
+ */ + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE, + PAGE_SIZE, true, 3); + } + + return young; +} +EXPORT_SYMBOL(contpte_ptep_clear_flush_young); + +int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + unsigned long start_addr; + pte_t orig_pte; + int i; + + /* + * Gather the access/dirty bits for the contiguous range. If nothing has + * changed, its a noop. + */ + orig_pte = pte_mknoncont(ptep_get(ptep)); + if (pte_val(orig_pte) == pte_val(entry)) + return 0; + + /* + * We can fix up access/dirty bits without having to unfold the contig + * range. But if the write bit is changing, we must unfold. + */ + if (pte_write(orig_pte) == pte_write(entry)) { + /* + * For HW access management, we technically only need to update + * the flag on a single pte in the range. But for SW access + * management, we need to update all the ptes to prevent extra + * faults. Avoid per-page tlb flush in __ptep_set_access_flags() + * and instead flush the whole range at the end. 
+ */ + ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + __ptep_set_access_flags(vma, addr, ptep, entry, 0); + + if (dirty) + __flush_tlb_range(vma, start_addr, addr, + PAGE_SIZE, true, 3); + } else { + __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); + __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + } + + return 1; +} +EXPORT_SYMBOL(contpte_ptep_set_access_flags); diff --git a/include/linux/efi.h b/include/linux/efi.h index c74f47711f0b..57da15e7429c 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -692,6 +692,11 @@ extern struct efi { extern struct mm_struct efi_mm; +static inline bool mm_is_efi(struct mm_struct *mm) +{ + return IS_ENABLED(CONFIG_EFI) && mm == &efi_mm; +} + static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) { -- cgit v1.2.3 From 311a6cf29690bb8295327bad0e76e0ad48cadcc4 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:00 +0000 Subject: arm64/mm: implement new wrprotect_ptes() batch API Optimize the contpte implementation to fix some of the fork performance regression introduced by the initial contpte commit. Subsequent patches will solve it entirely. During fork(), any private memory in the parent must be write-protected. Previously this was done 1 PTE at a time. But the core-mm supports batched wrprotect via the new wrprotect_ptes() API. So let's implement that API and for fully covered contpte mappings, we no longer need to unfold the contpte. This has 2 benefits: - reduced unfolding, reduces the number of tlbis that must be issued. - The memory remains contpte-mapped ("folded") in the parent, so it continues to benefit from the more efficient use of the TLB after the fork. The optimization to wrprotect a whole contpte block without unfolding is possible thanks to the tightening of the Arm ARM in respect to the definition and behaviour when 'Misprogramming the Contiguous bit'. 
See section D21194 at https://developer.arm.com/documentation/102105/ja-07/ Link: https://lkml.kernel.org/r/20240215103205.2607016-14-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 61 +++++++++++++++++++++++++++++++++------- arch/arm64/mm/contpte.c | 38 +++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 10 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 831099cfc96b..8643227c318b 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -978,16 +978,12 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* - * __ptep_set_wrprotect - mark read-only while trasferring potential hardware - * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. - */ -static inline void __ptep_set_wrprotect(struct mm_struct *mm, - unsigned long address, pte_t *ptep) +static inline void ___ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep, + pte_t pte) { - pte_t old_pte, pte; + pte_t old_pte; - pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); @@ -996,6 +992,25 @@ static inline void __ptep_set_wrprotect(struct mm_struct *mm, } while (pte_val(pte) != pte_val(old_pte)); } +/* + * __ptep_set_wrprotect - mark read-only while trasferring potential hardware + * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. 
+ */ +static inline void __ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep) +{ + ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep)); +} + +static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address, + pte_t *ptep, unsigned int nr) +{ + unsigned int i; + + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++) + __ptep_set_wrprotect(mm, address, ptep); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, @@ -1149,6 +1164,8 @@ extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr); extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); @@ -1268,12 +1285,35 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, return contpte_ptep_clear_flush_young(vma, addr, ptep); } +#define wrprotect_ptes wrprotect_ptes +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + if (likely(nr == 1)) { + /* + * Optimization: wrprotect_ptes() can only be called for present + * ptes so we only need to check contig bit as condition for + * unfold, and we can remove the contig bit from the pte we read + * to avoid re-reading. This speeds up fork() which is sensitive + * for order-0 folios. Equivalent to contpte_try_unfold(). 
+ */ + pte_t orig_pte = __ptep_get(ptep); + + if (unlikely(pte_cont(orig_pte))) { + __contpte_try_unfold(mm, addr, ptep, orig_pte); + orig_pte = pte_mknoncont(orig_pte); + } + ___ptep_set_wrprotect(mm, addr, ptep, orig_pte); + } else { + contpte_wrprotect_ptes(mm, addr, ptep, nr); + } +} + #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); - __ptep_set_wrprotect(mm, addr, ptep); + wrprotect_ptes(mm, addr, ptep, 1); } #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS @@ -1305,6 +1345,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, #define ptep_clear_flush_young __ptep_clear_flush_young #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define ptep_set_wrprotect __ptep_set_wrprotect +#define wrprotect_ptes __wrprotect_ptes #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags __ptep_set_access_flags diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 6d7f40667fa2..bedb58524535 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -26,6 +26,26 @@ static inline pte_t *contpte_align_down(pte_t *ptep) return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); } +static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * Unfold any partially covered contpte block at the beginning and end + * of the range. 
+ */ + + if (ptep != contpte_align_down(ptep) || nr < CONT_PTES) + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + + if (ptep + nr != contpte_align_down(ptep + nr)) { + unsigned long last_addr = addr + PAGE_SIZE * (nr - 1); + pte_t *last_ptep = ptep + nr - 1; + + contpte_try_unfold(mm, last_addr, last_ptep, + __ptep_get(last_ptep)); + } +} + static void contpte_convert(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -238,6 +258,24 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, } EXPORT_SYMBOL(contpte_ptep_clear_flush_young); +void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * If wrprotecting an entire contig range, we can avoid unfolding. Just + * set wrprotect and wait for the later mmu_gather flush to invalidate + * the tlb. Until the flush, the page may or may not be wrprotected. + * After the flush, it is guaranteed wrprotected. If it's a partial + * range though, we must unfold, because we can't have a case where + * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this + * would cause it to continue to be unpredictable after the flush. + */ + + contpte_try_unfold_partial(mm, addr, ptep, nr); + __wrprotect_ptes(mm, addr, ptep, nr); +} +EXPORT_SYMBOL(contpte_wrprotect_ptes); + int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty) -- cgit v1.2.3 From 6b1e4efb6f5499ae8f9f5cdda7502285a0edbf51 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:01 +0000 Subject: arm64/mm: implement new [get_and_]clear_full_ptes() batch APIs Optimize the contpte implementation to fix some of the exit/munmap/dontneed performance regression introduced by the initial contpte commit. Subsequent patches will solve it entirely. During exit(), munmap() or madvise(MADV_DONTNEED), mappings must be cleared. Previously this was done 1 PTE at a time. 
But the core-mm supports batched clear via the new [get_and_]clear_full_ptes() APIs. So let's implement those APIs and for fully covered contpte mappings, we no longer need to unfold the contpte. This significantly reduces unfolding operations, reducing the number of tlbis that must be issued. Link: https://lkml.kernel.org/r/20240215103205.2607016-15-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 67 ++++++++++++++++++++++++++++++++++++++++ arch/arm64/mm/contpte.c | 17 ++++++++++ 2 files changed, 84 insertions(+) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8643227c318b..a8f1a35e3086 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -965,6 +965,37 @@ static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, return pte; } +static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + __ptep_get_and_clear(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} + +static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = __ptep_get_and_clear(mm, addr, ptep); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} + 
#ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, @@ -1160,6 +1191,11 @@ extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); +extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full); +extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full); extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, @@ -1253,6 +1289,35 @@ static inline void pte_clear(struct mm_struct *mm, __pte_clear(mm, addr, ptep); } +#define clear_full_ptes clear_full_ptes +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __clear_full_ptes(mm, addr, ptep, nr, full); + } else { + contpte_clear_full_ptes(mm, addr, ptep, nr, full); + } +} + +#define get_and_clear_full_ptes get_and_clear_full_ptes +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte; + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } else { + pte = contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } + + return pte; +} + #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -1337,6 +1402,8 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, #define set_pte 
__set_pte #define set_ptes __set_ptes #define pte_clear __pte_clear +#define clear_full_ptes __clear_full_ptes +#define get_and_clear_full_ptes __get_and_clear_full_ptes #define __HAVE_ARCH_PTEP_GET_AND_CLEAR #define ptep_get_and_clear __ptep_get_and_clear #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index bedb58524535..50e0173dc5ee 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -212,6 +212,23 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(contpte_set_ptes); +void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + __clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL(contpte_clear_full_ptes); + +pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL(contpte_get_and_clear_full_ptes); + int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { -- cgit v1.2.3 From fb5451e5f72b31002760083a99fbb41771c4f1ad Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:03 +0000 Subject: arm64/mm: implement pte_batch_hint() When core code iterates over a range of ptes and calls ptep_get() for each of them, if the range happens to cover contpte mappings, the number of pte reads becomes amplified by a factor of the number of PTEs in a contpte block. This is because for each call to ptep_get(), the implementation must read all of the ptes in the contpte block to which it belongs to gather the access and dirty bits. This causes a hotspot for fork(), as well as operations that unmap memory such as munmap(), exit and madvise(MADV_DONTNEED). 
Fortunately we can fix this by implementing pte_batch_hint() which allows their iterators to skip getting the contpte tail ptes when gathering the batch of ptes to operate on. This results in the number of PTE reads returning to 1 per pte. Link: https://lkml.kernel.org/r/20240215103205.2607016-17-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Reviewed-by: David Hildenbrand Tested-by: John Hubbard Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index a8f1a35e3086..d759a20d2929 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1213,6 +1213,15 @@ static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, __contpte_try_unfold(mm, addr, ptep, pte); } +#define pte_batch_hint pte_batch_hint +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + if (!pte_valid_cont(pte)) + return 1; + + return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1)); +} + /* * The below functions constitute the public API that arm64 presents to the * core-mm to manipulate PTE entries within their page tables (or at least this -- cgit v1.2.3 From b972fc6afba002319fe23bc698ce6431ee43868c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:04 +0000 Subject: arm64/mm: __always_inline to improve fork() perf As set_ptes() and wrprotect_ptes() become a bit more complex, the compiler may choose not to inline them. But this is critical for fork() performance. 
So mark the functions, along with contpte_try_unfold() which is called by them, as __always_inline. This is worth ~1% on the fork() microbenchmark with order-0 folios (the common case). Link: https://lkml.kernel.org/r/20240215103205.2607016-18-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d759a20d2929..8310875133ff 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1206,8 +1206,8 @@ extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); -static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +static __always_inline void contpte_try_unfold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) { if (unlikely(pte_valid_cont(pte))) __contpte_try_unfold(mm, addr, ptep, pte); @@ -1278,7 +1278,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } #define set_ptes set_ptes -static inline void set_ptes(struct mm_struct *mm, unsigned long addr, +static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { pte = pte_mknoncont(pte); @@ -1360,8 +1360,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, } #define wrprotect_ptes wrprotect_ptes -static inline void wrprotect_ptes(struct 
mm_struct *mm, unsigned long addr, - pte_t *ptep, unsigned int nr) +static __always_inline void wrprotect_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr) { if (likely(nr == 1)) { /* -- cgit v1.2.3 From f0c2264958e18bc7bc35b567d51b99461e4de34f Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:05 +0000 Subject: arm64/mm: automatically fold contpte mappings There are situations where a change to a single PTE could cause the contpte block in which it resides to become foldable (i.e. could be repainted with the contiguous bit). Such situations arise, for example, when user space temporarily changes protections, via mprotect, for individual pages, such can be the case for certain garbage collectors. We would like to detect when such a PTE change occurs. However this can be expensive due to the amount of checking required. Therefore only perform the checks when an indiviual PTE is modified via mprotect (ptep_modify_prot_commit() -> set_pte_at() -> set_ptes(nr=1)) and only when we are setting the final PTE in a contpte-aligned block. Link: https://lkml.kernel.org/r/20240215103205.2607016-19-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 26 ++++++++++++++++ arch/arm64/mm/contpte.c | 64 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8310875133ff..401087e8a43d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1185,6 +1185,8 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, * where it is possible and makes sense to do so. The PTE_CONT bit is considered * a private implementation detail of the public ptep API (see below). */ +extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); @@ -1206,6 +1208,29 @@ extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); +static __always_inline void contpte_try_fold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +{ + /* + * Only bother trying if both the virtual and physical addresses are + * aligned and correspond to the last entry in a contig range. The core + * code mostly modifies ranges from low to high, so this is the likely + * the last modification in the contig range, so a good time to fold. + * We can't fold special mappings, because there is no associated folio. 
+ */ + + const unsigned long contmask = CONT_PTES - 1; + bool valign = ((addr >> PAGE_SHIFT) & contmask) == contmask; + + if (unlikely(valign)) { + bool palign = (pte_pfn(pte) & contmask) == contmask; + + if (unlikely(palign && + pte_valid(pte) && !pte_cont(pte) && !pte_special(pte))) + __contpte_try_fold(mm, addr, ptep, pte); + } +} + static __always_inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -1286,6 +1311,7 @@ static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, if (likely(nr == 1)) { contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); __set_ptes(mm, addr, ptep, pte, 1); + contpte_try_fold(mm, addr, ptep, pte); } else { contpte_set_ptes(mm, addr, ptep, pte, nr); } diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 50e0173dc5ee..16788f07716d 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -73,6 +73,70 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr, __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); } +void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the virtual and pysical addresses are + * correctly aligned for a contpte mapping in contpte_try_fold() so the + * remaining checks are to ensure that the contpte range is fully + * covered by a single folio, and ensure that all the ptes are valid + * with contiguous PFNs and matching prots. We ignore the state of the + * access and dirty bits for the purpose of deciding if its a contiguous + * range; the folding process will generate a single contpte entry which + * has a single access and dirty bit. Those 2 bits are the logical OR of + * their respective bits in the constituent pte entries. In order to + * ensure the contpte range is covered by a single folio, we must + * recover the folio from the pfn, but special mappings don't have a + * folio backing them. 
Fortunately contpte_try_fold() already checked + * that the pte is not special - we never try to fold special mappings. + * Note we can't use vm_normal_page() for this since we don't have the + * vma. + */ + + unsigned long folio_start, folio_end; + unsigned long cont_start, cont_end; + pte_t expected_pte, subpte; + struct folio *folio; + struct page *page; + unsigned long pfn; + pte_t *orig_ptep; + pgprot_t prot; + + int i; + + if (!mm_is_user(mm)) + return; + + page = pte_page(pte); + folio = page_folio(page); + folio_start = addr - (page - &folio->page) * PAGE_SIZE; + folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE; + cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE); + cont_end = cont_start + CONT_PTE_SIZE; + + if (folio_start > cont_start || folio_end < cont_end) + return; + + pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + expected_pte = pfn_pte(pfn, prot); + orig_ptep = ptep; + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++) { + subpte = pte_mkold(pte_mkclean(__ptep_get(ptep))); + if (!pte_same(subpte, expected_pte)) + return; + expected_pte = pte_advance_pfn(expected_pte, 1); + ptep++; + } + + pte = pte_mkcont(pte); + contpte_convert(mm, addr, orig_ptep, pte); +} +EXPORT_SYMBOL(__contpte_try_fold); + void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { -- cgit v1.2.3 From 85fcde402db191b5f222ebfecda653777d7d084e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:41 +0800 Subject: kexec: split crashkernel reservation code out from crash_core.c Patch series "Split crash out from kexec and clean up related config items", v3. Motivation: ============= Previously, LKP reported a building error. When investigating, it can't be resolved reasonablly with the present messy kdump config items. 
https://lore.kernel.org/oe-kbuild-all/202312182200.Ka7MzifQ-lkp@intel.com/ The kdump (crash dumping) related config items can cause confusion: Firstly, CRASH_CORE enables code including - crashkernel reservation; - elfcorehdr updating; - vmcoreinfo exporting; - crash hotplug handling; Now fadump of powerpc, kcore dynamic debugging and kdump all select CRASH_CORE, while - fadump needs crashkernel parsing, vmcoreinfo exporting, and accessing global variable 'elfcorehdr_addr'; - kcore only needs vmcoreinfo exporting; - kdump needs all of the current kernel/crash_core.c. So enabling only PROC_KCORE or FA_DUMP will enable CRASH_CORE, which misleads people into thinking that crash dumping is enabled when actually it is not. Secondly, it's not reasonable to allow KEXEC_CORE to select CRASH_CORE, because KEXEC_CORE enables code which allocates control pages, copies kexec/kdump segments, and prepares for switching. This code is shared by both kexec reboot and kdump. We may want kexec reboot but not kdump. In that case, CRASH_CORE should not be selected. -------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y --------------------- Thirdly, it's not reasonable to allow CRASH_DUMP to select KEXEC_CORE. That allows KEXEC_CORE and CRASH_DUMP to be enabled independently of KEXEC or KEXEC_FILE. However, w/o KEXEC or KEXEC_FILE, the KEXEC_CORE code built in doesn't make any sense because no kernel loading or switching will happen to utilize the KEXEC_CORE code. --------------------- CONFIG_CRASH_CORE=y CONFIG_KEXEC_CORE=y CONFIG_CRASH_DUMP=y --------------------- In this case, what is worse, on arch sh and arm, KEXEC relies on MMU, while CRASH_DUMP can still be enabled when !MMU, so a compile error is seen, as the lkp test robot reported in the above link.
------arch/sh/Kconfig------ config ARCH_SUPPORTS_KEXEC def_bool MMU config ARCH_SUPPORTS_CRASH_DUMP def_bool BROKEN_ON_SMP --------------------------- Changes: =========== 1, split out crash_reserve.c from crash_core.c; 2, split out vmcore_info.c from crash_core.c; 3, move crash related code in kexec_core.c into crash_core.c; 4, remove dependency of FA_DUMP on CRASH_DUMP; 5, clean up kdump related config items; 6, wrap up crash code in crash related ifdefs on all 8 arch-es which support crash dumping, except for ppc; Achievement: =========== With the above changes, I can rearrange the config item logic as below (the right item depends on or is selected by the left item): PROC_KCORE -----------> VMCORE_INFO |----------> VMCORE_INFO FA_DUMP----| |----------> CRASH_RESERVE ---->VMCORE_INFO / |---->CRASH_RESERVE KEXEC --| /| |--> KEXEC_CORE--> CRASH_DUMP-->/-|---->PROC_VMCORE KEXEC_FILE --| \ | \---->CRASH_HOTPLUG KEXEC --| |--> KEXEC_CORE (for kexec reboot only) KEXEC_FILE --| Test ======== On all 8 architectures, including x86_64, arm64, s390x, sh, arm, mips, riscv, loongarch, I tested the three config settings below and all builds passed.
Take configs on x86_64 as an example here: (1) Both CONFIG_KEXEC and CONFIG_KEXEC_FILE are unset, then all kexec/kdump items are unset automatically: # Kexec and crash features # CONFIG_KEXEC is not set # CONFIG_KEXEC_FILE is not set # end of Kexec and crash features (2) set CONFIG_KEXEC_FILE and 'make olddefconfig': --------------- # Kexec and crash features CONFIG_CRASH_RESERVE=y CONFIG_VMCORE_INFO=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC_FILE=y CONFIG_CRASH_DUMP=y CONFIG_CRASH_HOTPLUG=y CONFIG_CRASH_MAX_MEMORY_RANGES=8192 # end of Kexec and crash features --------------- (3) unset CONFIG_CRASH_DUMP in case 2 and execute 'make olddefconfig': ------------------------ # Kexec and crash features CONFIG_KEXEC_CORE=y CONFIG_KEXEC_FILE=y # end of Kexec and crash features ------------------------ Note: For ppc, it needs investigation to make clear how to split out crash code in the arch folder. I hope Hari and Pingfan can take a look and see if it's doable. For now, I make it either have both kexec and crash enabled, or disable both of them altogether. This patch (of 14): Both kdump and fa_dump of ppc rely on crashkernel reservation. Move the relevant code into separate files: crash_reserve.c, include/linux/crash_reserve.h. And also add config item CRASH_RESERVE to control enabling of that code. And update config items which are related to crashkernel reservation. And also change ifdeffery from CONFIG_CRASH_CORE to CONFIG_CRASH_RESERVE when those scopes are only crashkernel reservation related. And also rename arch/XXX/include/asm/{crash_core.h => crash_reserve.h} on arm64, x86 and risc-v because those architectures' crash_core.h is only related to crashkernel reservation. [akpm@linux-foundation.org: s/CRASH_RESEERVE/CRASH_RESERVE/, per Klara Modin] Link: https://lkml.kernel.org/r/20240124051254.67105-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20240124051254.67105-2-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Hari Bathini Cc: Al Viro Cc: Eric W.
Biederman Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/crash_core.h | 10 - arch/arm64/include/asm/crash_reserve.h | 10 + arch/powerpc/Kconfig | 1 + arch/powerpc/mm/nohash/kaslr_booke.c | 4 +- arch/riscv/Kconfig | 2 +- arch/riscv/include/asm/crash_core.h | 11 - arch/riscv/include/asm/crash_reserve.h | 11 + arch/x86/Kconfig | 2 +- arch/x86/include/asm/crash_core.h | 42 --- arch/x86/include/asm/crash_reserve.h | 42 +++ include/linux/crash_core.h | 40 --- include/linux/crash_reserve.h | 48 ++++ include/linux/kexec.h | 1 + kernel/Kconfig.kexec | 5 +- kernel/Makefile | 1 + kernel/crash_core.c | 438 ------------------------------- kernel/crash_reserve.c | 464 +++++++++++++++++++++++++++++++++ 18 files changed, 587 insertions(+), 547 deletions(-) delete mode 100644 arch/arm64/include/asm/crash_core.h create mode 100644 arch/arm64/include/asm/crash_reserve.h delete mode 100644 arch/riscv/include/asm/crash_core.h create mode 100644 arch/riscv/include/asm/crash_reserve.h delete mode 100644 arch/x86/include/asm/crash_core.h create mode 100644 arch/x86/include/asm/crash_reserve.h create mode 100644 include/linux/crash_reserve.h create mode 100644 kernel/crash_reserve.c (limited to 'arch/arm64/include') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 386566138620..5a7ac1f37bdc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1519,7 +1519,7 @@ config ARCH_SUPPORTS_CRASH_DUMP def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config TRANS_TABLE def_bool y diff --git a/arch/arm64/include/asm/crash_core.h b/arch/arm64/include/asm/crash_core.h deleted file mode 100644 index 9f5c8d339f44..000000000000 --- a/arch/arm64/include/asm/crash_core.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ARM64_CRASH_CORE_H -#define 
_ARM64_CRASH_CORE_H - -/* Current arm64 boot protocol requires 2MB alignment */ -#define CRASH_ALIGN SZ_2M - -#define CRASH_ADDR_LOW_MAX arm64_dma_phys_limit -#define CRASH_ADDR_HIGH_MAX (PHYS_MASK + 1) -#endif diff --git a/arch/arm64/include/asm/crash_reserve.h b/arch/arm64/include/asm/crash_reserve.h new file mode 100644 index 000000000000..4afe027a4e7b --- /dev/null +++ b/arch/arm64/include/asm/crash_reserve.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ARM64_CRASH_RESERVE_H +#define _ARM64_CRASH_RESERVE_H + +/* Current arm64 boot protocol requires 2MB alignment */ +#define CRASH_ALIGN SZ_2M + +#define CRASH_ADDR_LOW_MAX arm64_dma_phys_limit +#define CRASH_ADDR_HIGH_MAX (PHYS_MASK + 1) +#endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b9fc064d38d2..7f704ae5c5ef 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -691,6 +691,7 @@ config FA_DUMP bool "Firmware-assisted dump" depends on PPC64 && (PPC_RTAS || PPC_POWERNV) select CRASH_CORE + select CRASH_RESERVE select CRASH_DUMP help A robust mechanism to get reliable kernel crash dump with diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index b4f2786a7d2b..cdff129abb14 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -173,7 +173,7 @@ static __init bool overlaps_region(const void *fdt, u32 start, static void __init get_crash_kernel(void *fdt, unsigned long size) { -#ifdef CONFIG_CRASH_CORE +#ifdef CONFIG_CRASH_RESERVE unsigned long long crash_size, crash_base; int ret; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a068..bd06ad1bb97c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -767,7 +767,7 @@ config ARCH_SUPPORTS_CRASH_DUMP def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config COMPAT 
bool "Kernel support for 32-bit U-mode" diff --git a/arch/riscv/include/asm/crash_core.h b/arch/riscv/include/asm/crash_core.h deleted file mode 100644 index e1874b23feaf..000000000000 --- a/arch/riscv/include/asm/crash_core.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _RISCV_CRASH_CORE_H -#define _RISCV_CRASH_CORE_H - -#define CRASH_ALIGN PMD_SIZE - -#define CRASH_ADDR_LOW_MAX dma32_phys_limit -#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() - -extern phys_addr_t memblock_end_of_DRAM(void); -#endif diff --git a/arch/riscv/include/asm/crash_reserve.h b/arch/riscv/include/asm/crash_reserve.h new file mode 100644 index 000000000000..013962e63587 --- /dev/null +++ b/arch/riscv/include/asm/crash_reserve.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _RISCV_CRASH_RESERVE_H +#define _RISCV_CRASH_RESERVE_H + +#define CRASH_ALIGN PMD_SIZE + +#define CRASH_ADDR_LOW_MAX dma32_phys_limit +#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() + +extern phys_addr_t memblock_end_of_DRAM(void); +#endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5edec175b9bf..5bd925815154 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2106,7 +2106,7 @@ config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - def_bool CRASH_CORE + def_bool CRASH_RESERVE config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_core.h deleted file mode 100644 index 76af98f4e801..000000000000 --- a/arch/x86/include/asm/crash_core.h +++ /dev/null @@ -1,42 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _X86_CRASH_CORE_H -#define _X86_CRASH_CORE_H - -/* 16M alignment for crash kernel regions */ -#define CRASH_ALIGN SZ_16M - -/* - * Keep the crash kernel below this limit. 
- * - * Earlier 32-bits kernels would limit the kernel to the low 512 MB range - * due to mapping restrictions. - * - * 64-bit kdump kernels need to be restricted to be under 64 TB, which is - * the upper limit of system RAM in 4-level paging mode. Since the kdump - * jump could be from 5-level paging to 4-level paging, the jump will fail if - * the kernel is put above 64 TB, and during the 1st kernel bootup there's - * no good way to detect the paging mode of the target kernel which will be - * loaded for dumping. - */ -extern unsigned long swiotlb_size_or_default(void); - -#ifdef CONFIG_X86_32 -# define CRASH_ADDR_LOW_MAX SZ_512M -# define CRASH_ADDR_HIGH_MAX SZ_512M -#else -# define CRASH_ADDR_LOW_MAX SZ_4G -# define CRASH_ADDR_HIGH_MAX SZ_64T -#endif - -# define DEFAULT_CRASH_KERNEL_LOW_SIZE crash_low_size_default() - -static inline unsigned long crash_low_size_default(void) -{ -#ifdef CONFIG_X86_64 - return max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); -#else - return 0; -#endif -} - -#endif /* _X86_CRASH_CORE_H */ diff --git a/arch/x86/include/asm/crash_reserve.h b/arch/x86/include/asm/crash_reserve.h new file mode 100644 index 000000000000..152239f95541 --- /dev/null +++ b/arch/x86/include/asm/crash_reserve.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_CRASH_RESERVE_H +#define _X86_CRASH_RESERVE_H + +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN SZ_16M + +/* + * Keep the crash kernel below this limit. + * + * Earlier 32-bits kernels would limit the kernel to the low 512 MB range + * due to mapping restrictions. + * + * 64-bit kdump kernels need to be restricted to be under 64 TB, which is + * the upper limit of system RAM in 4-level paging mode. 
Since the kdump + * jump could be from 5-level paging to 4-level paging, the jump will fail if + * the kernel is put above 64 TB, and during the 1st kernel bootup there's + * no good way to detect the paging mode of the target kernel which will be + * loaded for dumping. + */ +extern unsigned long swiotlb_size_or_default(void); + +#ifdef CONFIG_X86_32 +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M +#else +# define CRASH_ADDR_LOW_MAX SZ_4G +# define CRASH_ADDR_HIGH_MAX SZ_64T +#endif + +# define DEFAULT_CRASH_KERNEL_LOW_SIZE crash_low_size_default() + +static inline unsigned long crash_low_size_default(void) +{ +#ifdef CONFIG_X86_64 + return max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); +#else + return 0; +#endif +} + +#endif /* _X86_CRASH_RESERVE_H */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 9eaeaafe0cad..1fde49246fa6 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -5,14 +5,6 @@ #include #include #include -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#include -#endif - -/* Location of a reserved region to hold the crash kernel. 
- */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; #define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) @@ -87,38 +79,6 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base, - unsigned long long *low_size, bool *high); - -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE -#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) -#endif -#ifndef CRASH_ALIGN -#define CRASH_ALIGN SZ_2M -#endif -#ifndef CRASH_ADDR_LOW_MAX -#define CRASH_ADDR_LOW_MAX SZ_4G -#endif -#ifndef CRASH_ADDR_HIGH_MAX -#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() -#endif - -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high); -#else -static inline void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) -{} -#endif - /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h new file mode 100644 index 000000000000..5a9df944fb80 --- /dev/null +++ b/include/linux/crash_reserve.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_CRASH_RESERVE_H +#define LINUX_CRASH_RESERVE_H + +#include +#include +#include +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#include +#endif + +/* Location of a reserved region to hold the crash kernel. 
+ */ +extern struct resource crashk_res; +extern struct resource crashk_low_res; + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base, + unsigned long long *low_size, bool *high); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE +#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) +#endif +#ifndef CRASH_ALIGN +#define CRASH_ALIGN SZ_2M +#endif +#ifndef CRASH_ADDR_LOW_MAX +#define CRASH_ADDR_LOW_MAX SZ_4G +#endif +#ifndef CRASH_ADDR_HIGH_MAX +#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM() +#endif + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high); +#else +static inline void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{} +#endif +#endif /* LINUX_CRASH_RESERVE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 400cb6c02176..6d79bfb52e5b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -16,6 +16,7 @@ #if !defined(__ASSEMBLY__) #include +#include #include #include diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 946dffa048b7..8b7be71edd85 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -2,11 +2,15 @@ menu "Kexec and crash features" +config CRASH_RESERVE + bool + config CRASH_CORE bool config KEXEC_CORE select CRASH_CORE + select CRASH_RESERVE bool config KEXEC_ELF @@ -96,7 +100,6 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" depends on ARCH_SUPPORTS_CRASH_DUMP - select CRASH_CORE select KEXEC_CORE help Generate crash dump after being started by kexec. 
diff --git a/kernel/Makefile b/kernel/Makefile index ce105a5558fc..05fa88b3ab74 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o +obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 49b31e59d3cc..ae0d1ce89b46 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -34,444 +34,6 @@ u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; -/* Location of the reserved area for the crash kernel */ -struct resource crashk_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; -struct resource crashk_low_res = { - .name = "Crash kernel", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, - .desc = IORES_DESC_CRASH_KERNEL -}; - -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - unsigned long long total_mem = system_ram; - - /* - * Firmware sometimes reserves some memory regions for its own use, - * so the system memory size is less than the actual physical memory - * size. 
Work around this by rounding up the total size to 128M, - * which is enough for most test cases. - */ - total_mem = roundup(total_mem, SZ_128M); - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= total_mem) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (total_mem >= start && total_mem < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } else - pr_info("crashkernel size resulted in zero bytes\n"); - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. 
- */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - 
} - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - char *name = "crashkernel="; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - if (!ck_cmdline) - return -ENOENT; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. - * - * If crashkernel=,high|low is supported on architecture, non-NULL values - * should be passed to parameters 'low_size' and 'high'. - */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - unsigned long long *low_size, - bool *high) -{ - int ret; - - /* crashkernel=X[@offset] */ - ret = __parse_crashkernel(cmdline, system_ram, crash_size, - crash_base, NULL); -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION - /* - * If non-NULL 'high' passed in and no normal crashkernel - * setting detected, try parsing crashkernel=,high|low. 
- */ - if (high && ret == -ENOENT) { - ret = __parse_crashkernel(cmdline, 0, crash_size, - crash_base, suffix_tbl[SUFFIX_HIGH]); - if (ret || !*crash_size) - return -EINVAL; - - /* - * crashkernel=Y,low can be specified or not, but invalid value - * is not allowed. - */ - ret = __parse_crashkernel(cmdline, 0, low_size, - crash_base, suffix_tbl[SUFFIX_LOW]); - if (ret == -ENOENT) { - *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - ret = 0; - } else if (ret) { - return ret; - } - - *high = true; - } -#endif - if (!*crash_size) - ret = -EINVAL; - - return ret; -} - -/* - * Add a dummy early_param handler to mark crashkernel= as a known command line - * parameter and suppress incorrect warnings in init/main.c. - */ -static int __init parse_crashkernel_dummy(char *arg) -{ - return 0; -} -early_param("crashkernel", parse_crashkernel_dummy); - -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -static int __init reserve_crashkernel_low(unsigned long long low_size) -{ -#ifdef CONFIG_64BIT - unsigned long long low_base; - - low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); - if (!low_base) { - pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); - return -ENOMEM; - } - - pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", - low_base, low_base + low_size, low_size >> 20); - - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; -#endif - return 0; -} - -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) -{ - unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; - bool fixed_base = false; - - /* User specifies base address explicitly. 
*/ - if (crash_base) { - fixed_base = true; - search_base = crash_base; - search_end = crash_base + crash_size; - } else if (high) { - search_base = CRASH_ADDR_LOW_MAX; - search_end = CRASH_ADDR_HIGH_MAX; - } - -retry: - crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, - search_base, search_end); - if (!crash_base) { - /* - * For crashkernel=size[KMG]@offset[KMG], print out failure - * message if can't reserve the specified region. - */ - if (fixed_base) { - pr_warn("crashkernel reservation failed - memory is in use.\n"); - return; - } - - /* - * For crashkernel=size[KMG], if the first attempt was for - * low memory, fall back to high memory, the minimum required - * low memory will be reserved later. - */ - if (!high && search_end == CRASH_ADDR_LOW_MAX) { - search_end = CRASH_ADDR_HIGH_MAX; - search_base = CRASH_ADDR_LOW_MAX; - crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; - goto retry; - } - - /* - * For crashkernel=size[KMG],high, if the first attempt was - * for high memory, fall back to low memory. - */ - if (high && search_end == CRASH_ADDR_HIGH_MAX) { - search_end = CRASH_ADDR_LOW_MAX; - search_base = 0; - goto retry; - } - pr_warn("cannot allocate crashkernel (size:0x%llx)\n", - crash_size); - return; - } - - if ((crash_base >= CRASH_ADDR_LOW_MAX) && - crash_low_size && reserve_crashkernel_low(crash_low_size)) { - memblock_phys_free(crash_base, crash_size); - return; - } - - pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", - crash_base, crash_base + crash_size, crash_size >> 20); - - /* - * The crashkernel memory will be removed from the kernel linear - * map. Inform kmemleak so that it won't try to access it. 
- */ - kmemleak_ignore_phys(crash_base); - if (crashk_low_res.end) - kmemleak_ignore_phys(crashk_low_res.start); - - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; -} - -static __init int insert_crashkernel_resources(void) -{ - if (crashk_res.start < crashk_res.end) - insert_resource(&iomem_resource, &crashk_res); - - if (crashk_low_res.start < crashk_low_res.end) - insert_resource(&iomem_resource, &crashk_low_res); - - return 0; -} -early_initcall(insert_crashkernel_resources); -#endif - int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, void **addr, unsigned long *sz) { diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c new file mode 100644 index 000000000000..bbb6c3cb00e4 --- /dev/null +++ b/kernel/crash_reserve.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "kallsyms_internal.h" +#include "kexec_internal.h" + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL +}; + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. 
+ */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + unsigned long long total_mem = system_ram; + + /* + * Firmware sometimes reserves some memory regions for its own use, + * so the system memory size is less than the actual physical memory + * size. Work around this by rounding up the total size to 128M, + * which is enough for most test cases. + */ + total_mem = roundup(total_mem, SZ_128M); + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= total_mem) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? 
*/ + if (total_mem >= start && total_mem < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } else + pr_info("crashkernel size resulted in zero bytes\n"); + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. 
+ */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + char *name = "crashkernel="; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + if (!ck_cmdline) + return -ENOENT; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must 
be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + * + * If crashkernel=,high|low is supported on architecture, non-NULL values + * should be passed to parameters 'low_size' and 'high'. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + unsigned long long *low_size, + bool *high) +{ + int ret; + + /* crashkernel=X[@offset] */ + ret = __parse_crashkernel(cmdline, system_ram, crash_size, + crash_base, NULL); +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION + /* + * If non-NULL 'high' passed in and no normal crashkernel + * setting detected, try parsing crashkernel=,high|low. + */ + if (high && ret == -ENOENT) { + ret = __parse_crashkernel(cmdline, 0, crash_size, + crash_base, suffix_tbl[SUFFIX_HIGH]); + if (ret || !*crash_size) + return -EINVAL; + + /* + * crashkernel=Y,low can be specified or not, but invalid value + * is not allowed. + */ + ret = __parse_crashkernel(cmdline, 0, low_size, + crash_base, suffix_tbl[SUFFIX_LOW]); + if (ret == -ENOENT) { + *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + ret = 0; + } else if (ret) { + return ret; + } + + *high = true; + } +#endif + if (!*crash_size) + ret = -EINVAL; + + return ret; +} + +/* + * Add a dummy early_param handler to mark crashkernel= as a known command line + * parameter and suppress incorrect warnings in init/main.c. 
+ */ +static int __init parse_crashkernel_dummy(char *arg) +{ + return 0; +} +early_param("crashkernel", parse_crashkernel_dummy); + +#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +static int __init reserve_crashkernel_low(unsigned long long low_size) +{ +#ifdef CONFIG_64BIT + unsigned long long low_base; + + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); + if (!low_base) { + pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); + return -ENOMEM; + } + + pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n", + low_base, low_base + low_size, low_size >> 20); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif + return 0; +} + +void __init reserve_crashkernel_generic(char *cmdline, + unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) +{ + unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; + bool fixed_base = false; + + /* User specifies base address explicitly. */ + if (crash_base) { + fixed_base = true; + search_base = crash_base; + search_end = crash_base + crash_size; + } else if (high) { + search_base = CRASH_ADDR_LOW_MAX; + search_end = CRASH_ADDR_HIGH_MAX; + } + +retry: + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + search_base, search_end); + if (!crash_base) { + /* + * For crashkernel=size[KMG]@offset[KMG], print out failure + * message if can't reserve the specified region. + */ + if (fixed_base) { + pr_warn("crashkernel reservation failed - memory is in use.\n"); + return; + } + + /* + * For crashkernel=size[KMG], if the first attempt was for + * low memory, fall back to high memory, the minimum required + * low memory will be reserved later. 
+ */ + if (!high && search_end == CRASH_ADDR_LOW_MAX) { + search_end = CRASH_ADDR_HIGH_MAX; + search_base = CRASH_ADDR_LOW_MAX; + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + goto retry; + } + + /* + * For crashkernel=size[KMG],high, if the first attempt was + * for high memory, fall back to low memory. + */ + if (high && search_end == CRASH_ADDR_HIGH_MAX) { + search_end = CRASH_ADDR_LOW_MAX; + search_base = 0; + goto retry; + } + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; + } + + if ((crash_base >= CRASH_ADDR_LOW_MAX) && + crash_low_size && reserve_crashkernel_low(crash_low_size)) { + memblock_phys_free(crash_base, crash_size); + return; + } + + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + /* + * The crashkernel memory will be removed from the kernel linear + * map. Inform kmemleak so that it won't try to access it. + */ + kmemleak_ignore_phys(crash_base); + if (crashk_low_res.end) + kmemleak_ignore_phys(crashk_low_res.start); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} + +static __init int insert_crashkernel_resources(void) +{ + if (crashk_res.start < crashk_res.end) + insert_resource(&iomem_resource, &crashk_res); + + if (crashk_low_res.start < crashk_low_res.end) + insert_resource(&iomem_resource, &crashk_low_res); + + return 0; +} +early_initcall(insert_crashkernel_resources); +#endif -- cgit v1.2.3 From 40254101d87870b2e5ac3ddc28af40aa04c48486 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Jan 2024 13:12:47 +0800 Subject: arm64, crash: wrap crash dumping code into crash related ifdefs Now crash codes under kernel/ folder has been split out from kexec code, crash dumping can be separated from kexec reboot in config items on arm64 with some adjustments. Here wrap up crash dumping codes with CONFIG_CRASH_DUMP ifdeffery. 
[bhe@redhat.com: fix building error in generic codes] Link: https://lkml.kernel.org/r/20240129135033.157195-2-bhe@redhat.com Link: https://lkml.kernel.org/r/20240124051254.67105-8-bhe@redhat.com Signed-off-by: Baoquan He Cc: Al Viro Cc: Eric W. Biederman Cc: Hari Bathini Cc: Pingfan Liu Cc: Klara Modin Cc: Michael Kelley Cc: Nathan Chancellor Cc: Stephen Rothwell Cc: Yang Li Signed-off-by: Andrew Morton --- arch/arm64/include/asm/kexec.h | 2 +- arch/arm64/kernel/machine_kexec.c | 2 +- arch/arm64/kernel/machine_kexec_file.c | 10 ++++++++-- arch/arm64/mm/init.c | 2 +- drivers/of/kexec.c | 2 ++ kernel/kexec_file.c | 2 ++ 6 files changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 9ac9572a3bbe..4d9cc7a76d9c 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -80,7 +80,7 @@ static inline void crash_setup_regs(struct pt_regs *newregs, } } -#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_HIBERNATION) +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) extern bool crash_is_nosave(unsigned long pfn); extern void crash_prepare_suspend(void); extern void crash_post_resume(void); diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index b38aae5b488d..82e2203d86a3 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -255,7 +255,7 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Starting crashdump kernel...\n"); } -#ifdef CONFIG_HIBERNATION +#if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_HIBERNATION) /* * To preserve the crash dump kernel image, the relevant memory segments * should be mapped again around the hibernation. 
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 0e017358f4ba..af1ca875c52c 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -39,6 +39,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return kexec_image_post_load_cleanup_default(image); } +#ifdef CONFIG_CRASH_DUMP static int prepare_elf_headers(void **addr, unsigned long *sz) { struct crash_mem *cmem; @@ -80,6 +81,7 @@ out: kfree(cmem); return ret; } +#endif /* * Tries to add the initrd and DTB to the image. If it is not possible to find @@ -93,8 +95,8 @@ int load_other_segments(struct kimage *image, char *cmdline) { struct kexec_buf kbuf; - void *headers, *dtb = NULL; - unsigned long headers_sz, initrd_load_addr = 0, dtb_len, + void *dtb = NULL; + unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; int ret = 0; @@ -102,7 +104,10 @@ int load_other_segments(struct kimage *image, /* not allocate anything below the kernel */ kbuf.buf_min = kernel_load_addr + kernel_size; +#ifdef CONFIG_CRASH_DUMP /* load elf core header */ + void *headers; + unsigned long headers_sz; if (image->type == KEXEC_TYPE_CRASH) { ret = prepare_elf_headers(&headers, &headers_sz); if (ret) { @@ -130,6 +135,7 @@ int load_other_segments(struct kimage *image, kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", image->elf_load_addr, kbuf.bufsz, kbuf.memsz); } +#endif /* load initrd */ if (initrd) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 74c1db8ce271..c1f6213e77f3 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -100,7 +100,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c index 68278340cecf..9ccde2fd77cb 100644 --- 
a/drivers/of/kexec.c +++ b/drivers/of/kexec.c @@ -395,6 +395,7 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image, if (ret) goto out; +#ifdef CONFIG_CRASH_DUMP /* add linux,usable-memory-range */ ret = fdt_appendprop_addrrange(fdt, 0, chosen_node, "linux,usable-memory-range", crashk_res.start, @@ -410,6 +411,7 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image, if (ret) goto out; } +#endif } /* add bootargs */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ce7ce2ae27cd..2d1db05fbf04 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -540,8 +540,10 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, phys_addr_t mstart, mend; struct resource res = { }; +#ifdef CONFIG_CRASH_DUMP if (kbuf->image->type == KEXEC_TYPE_CRASH) return func(&crashk_res, kbuf); +#endif /* * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See -- cgit v1.2.3