From 527ed4f7d902d362471a93e1a4afb604c18ceb48 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Jun 2023 14:22:52 +0800 Subject: mm: remove arguments of show_mem() All callers of show_mem() pass 0 and NULL, so we can remove the two arguments by directly calling __show_mem(0, NULL, MAX_NR_ZONES - 1) in show_mem(). Link: https://lkml.kernel.org/r/20230630062253.189440-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Christophe Leroy Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/powerpc/xmon/xmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index fae747cc57d2..ee17270d35d0 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1084,7 +1084,7 @@ cmds(struct pt_regs *excp) memzcan(); break; case 'i': - show_mem(0, NULL); + show_mem(); break; default: termch = cmd; -- cgit v1.2.3 From 3d140215a6aec37f112aec1606c6a76f7e4443d3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:34:25 -0700 Subject: powerpc: assert_pte_locked() use pte_offset_map_nolock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of pte_lockptr(), use the recently added pte_offset_map_nolock() in assert_pte_locked(). BUG if pte_offset_map_nolock() fails. This mod might cause new crashes: which either expose my ignorance, or indicate issues to be fixed, or limit the usage of assert_pte_locked(). [hughd@google.com: assert_pte_locked() still needs the pmd_none() check] Link: https://lkml.kernel.org/r/c73d1543-532c-3da2-8cf2-a95363a14116@google.com Link: https://lkml.kernel.org/r/e8d56c95-c132-a82e-5f5f-7bb1b738b057@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/mm/pgtable.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index cb2dcdb18f8e..a3dcdb2d5b4b 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -311,6 +311,8 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) p4d_t *p4d; pud_t *pud; pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; if (mm == &init_mm) return; @@ -329,8 +331,10 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) */ if (pmd_none(*pmd)) return; - BUG_ON(!pmd_present(*pmd)); - assert_spin_locked(pte_lockptr(mm, pmd)); + pte = pte_offset_map_nolock(mm, pmd, addr, &ptl); + BUG_ON(!pte); + assert_spin_locked(ptl); + pte_unmap(pte); } #endif /* CONFIG_DEBUG_VM */ -- cgit v1.2.3 From 32cc0b7c9d508efde8946a82eb3c4acfa8dfed15 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:35:59 -0700 Subject: powerpc: add pte_free_defer() for pgtables sharing page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add powerpc-specific pte_free_defer(), to free table page via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This precedes the generic version to avoid build breakage from incompatible pgtable_t. This is awkward because the struct page contains only one rcu_head, but that page may be shared between PTE_FRAG_NR pagetables, each wanting to use the rcu_head at the same time. But powerpc never reuses a fragment once it has been freed: so mark the page Active in pte_free_defer(), before calling pte_fragment_free() directly; and there call_rcu() to pte_free_now() when last fragment is freed and the page is PageActive. Link: https://lkml.kernel.org/r/6e3ca5f1-334d-4b14-b92d-fc8e99914fcb@google.com Suggested-by: Jason Gunthorpe Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgalloc.h | 4 ++++ arch/powerpc/mm/pgtable-frag.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 3360cad78ace..3a971e2a8c73 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -45,6 +45,10 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) pte_fragment_free((unsigned long *)ptepage, 0); } +/* arch use pte_free_defer() implementation in arch/powerpc/mm/pgtable-frag.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + /* * Functions that deal with pagetables that could be at any level of * the table need to be passed an "index_size" so they know how to diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 20652daa1d7e..0c6b68130025 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -106,6 +106,15 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) return __alloc_for_ptecache(mm, kernel); } +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + pgtable_pte_page_dtor(page); + __free_page(page); +} + void pte_fragment_free(unsigned long *table, int kernel) { struct page *page = virt_to_page(table); @@ -115,8 +124,22 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&page->pt_frag_refcount)) { - if (!kernel) - pgtable_pte_page_dtor(page); - __free_page(page); + if (kernel) + __free_page(page); + else if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); } } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + SetPageActive(page); + pte_fragment_free((unsigned long *)pgtable, 0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit v1.2.3 From 0b1f77e74b5a9234e83dc89f1593b769547a37fa Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:02 +0800 Subject: asm-generic/iomap.h: remove ARCH_HAS_IOREMAP_xx macros Patch series "mm: ioremap: Convert architectures to take GENERIC_IOREMAP way", v8. Motivation and implementation: ============================== Currently, many architecutres have't taken the standard GENERIC_IOREMAP way to implement ioremap_prot(), iounmap(), and ioremap_xx(), but make these functions specifically under each arch's folder. Those cause many duplicated code of ioremap() and iounmap(). In this patchset, firstly introduce generic_ioremap_prot() and generic_iounmap() to extract the generic code for GENERIC_IOREMAP. By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic version if there's arch specific handling in its corresponding ioremap_prot(), ioremap() or iounmap(). With these changes, duplicated ioremap/iounmap() code uder ARCH-es are removed, and the equivalent functioality is kept as before. Background info: ================ 1) The converting more architectures to take GENERIC_IOREMAP way is suggested by Christoph in below discussion: https://lore.kernel.org/all/Yp7h0Jv6vpgt6xdZ@infradead.org/T/#u 2) In the previous v1 to v3, it's basically further action after arm64 has converted to GENERIC_IOREMAP way in below patchset. It's done by adding hook ioremap_allowed() and iounmap_allowed() in ARCH to add ARCH specific handling the middle of ioremap_prot() and iounmap(). [PATCH v5 0/6] arm64: Cleanup ioremap() and support ioremap_prot() https://lore.kernel.org/all/20220607125027.44946-1-wangkefeng.wang@huawei.com/T/#u Later, during v3 reviewing, Christophe Leroy suggested to introduce generic_ioremap_prot() and generic_iounmap() to generic codes, and ARCH can provide wrapper function ioremap_prot(), ioremap() or iounmap() if needed. Christophe made a RFC patchset as below to specially demonstrate his idea. This is what v4 and now v5 is doing. [RFC PATCH 0/8] mm: ioremap: Convert architectures to take GENERIC_IOREMAP way https://lore.kernel.org/all/cover.1665568707.git.christophe.leroy@csgroup.eu/T/#u Testing: ======== In v8, I only applied this patchset onto the latest linus's tree to build and run on arm64 and s390. This patch (of 19): Let's use '#define ioremap_xx' and "#ifdef ioremap_xx" instead. To remove defined ARCH_HAS_IOREMAP_xx macros in of each ARCH, the ARCH's own ioremap_wc|wt|np definition need be above "#include . Otherwise the redefinition error would be seen during compiling. So the relevant adjustments are made to avoid compiling error: loongarch: - doesn't include , defining ARCH_HAS_IOREMAP_WC is redundant, so simply remove it. m68k: - selected GENERIC_IOMAP, has been added in , and is included above , so simply remove ARCH_HAS_IOREMAP_WT defining. mips: - move "#include " below ioremap_wc definition in powerpc: - remove "#include " in because it's duplicated with the one in , let's rely on the latter. x86: - selected GENERIC_IOMAP, remove #include in the middle of . Let's rely on . Link: https://lkml.kernel.org/r/20230706154520.11257-2-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Geert Uytterhoeven Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Christoph Hellwig Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Christophe Leroy Cc: David Laight Cc: Helge Deller Cc: John Paul Adrian Glaubitz Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Nathan Chancellor Cc: Niklas Schnelle Cc: Stafford Horne Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "James E.J. Bottomley" Cc: Jonas Bonn Cc: Max Filippov Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Rich Felker Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/io.h | 2 -- arch/m68k/include/asm/io_mm.h | 2 -- arch/m68k/include/asm/kmap.h | 2 -- arch/mips/include/asm/io.h | 5 ++--- arch/powerpc/include/asm/io.h | 9 +-------- arch/x86/include/asm/io.h | 5 ----- drivers/net/ethernet/sfc/io.h | 2 +- drivers/net/ethernet/sfc/siena/io.h | 2 +- include/asm-generic/iomap.h | 6 +++--- 9 files changed, 8 insertions(+), 27 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index 1c9410220040..0dcb36b32cb2 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -5,8 +5,6 @@ #ifndef _ASM_IO_H #define _ASM_IO_H -#define ARCH_HAS_IOREMAP_WC - #include #include diff --git a/arch/m68k/include/asm/io_mm.h b/arch/m68k/include/asm/io_mm.h index d41fa488453b..6a0abd4846c6 100644 --- a/arch/m68k/include/asm/io_mm.h +++ b/arch/m68k/include/asm/io_mm.h @@ -26,8 +26,6 @@ #include #include -#include - #ifdef CONFIG_ATARI #define atari_readb raw_inb #define atari_writeb raw_outb diff --git a/arch/m68k/include/asm/kmap.h b/arch/m68k/include/asm/kmap.h index dec05743d426..4efb3efa593a 100644 --- a/arch/m68k/include/asm/kmap.h +++ b/arch/m68k/include/asm/kmap.h @@ -4,8 +4,6 @@ #ifdef CONFIG_MMU -#define ARCH_HAS_IOREMAP_WT - /* Values for nocacheflag and cmode */ #define IOMAP_FULL_CACHING 0 #define IOMAP_NOCACHE_SER 1 diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index affd21e9c20b..062dd4e6b954 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -12,8 +12,6 @@ #ifndef _ASM_IO_H #define _ASM_IO_H -#define ARCH_HAS_IOREMAP_WC - #include #include #include @@ -25,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -210,6 +207,8 @@ void iounmap(const volatile void __iomem *addr); #define ioremap_wc(offset, size) \ ioremap_prot((offset), (size), boot_cpu_data.writecombine) +#include + #if defined(CONFIG_CPU_CAVIUM_OCTEON) #define war_io_reorder_wmb() wmb() #else diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index f1e657c9bbe8..67a3fb6de498 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -3,11 +3,6 @@ #define _ASM_POWERPC_IO_H #ifdef __KERNEL__ -#define ARCH_HAS_IOREMAP_WC -#ifdef CONFIG_PPC32 -#define ARCH_HAS_IOREMAP_WT -#endif - /* */ @@ -732,9 +727,7 @@ static inline void name at \ #define writel_relaxed(v, addr) writel(v, addr) #define writeq_relaxed(v, addr) writeq(v, addr) -#ifdef CONFIG_GENERIC_IOMAP -#include -#else +#ifndef CONFIG_GENERIC_IOMAP /* * Here comes the implementation of the IOMAP interfaces. */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index e9025640f634..76238842406a 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -35,9 +35,6 @@ * - Arnaldo Carvalho de Melo */ -#define ARCH_HAS_IOREMAP_WC -#define ARCH_HAS_IOREMAP_WT - #include #include #include @@ -212,8 +209,6 @@ void memset_io(volatile void __iomem *, int, size_t); #define memcpy_toio memcpy_toio #define memset_io memset_io -#include - /* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. The fact that the ISA IO space is mapped diff --git a/drivers/net/ethernet/sfc/io.h b/drivers/net/ethernet/sfc/io.h index 30439cc83a89..07f99ad14bf3 100644 --- a/drivers/net/ethernet/sfc/io.h +++ b/drivers/net/ethernet/sfc/io.h @@ -70,7 +70,7 @@ */ #ifdef CONFIG_X86_64 /* PIO is a win only if write-combining is possible */ -#ifdef ARCH_HAS_IOREMAP_WC +#ifdef ioremap_wc #define EFX_USE_PIO 1 #endif #endif diff --git a/drivers/net/ethernet/sfc/siena/io.h b/drivers/net/ethernet/sfc/siena/io.h index 30439cc83a89..07f99ad14bf3 100644 --- a/drivers/net/ethernet/sfc/siena/io.h +++ b/drivers/net/ethernet/sfc/siena/io.h @@ -70,7 +70,7 @@ */ #ifdef CONFIG_X86_64 /* PIO is a win only if write-combining is possible */ -#ifdef ARCH_HAS_IOREMAP_WC +#ifdef ioremap_wc #define EFX_USE_PIO 1 #endif #endif diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h index 08237ae8b840..196087a8126e 100644 --- a/include/asm-generic/iomap.h +++ b/include/asm-generic/iomap.h @@ -93,15 +93,15 @@ extern void __iomem *ioport_map(unsigned long port, unsigned int nr); extern void ioport_unmap(void __iomem *); #endif -#ifndef ARCH_HAS_IOREMAP_WC +#ifndef ioremap_wc #define ioremap_wc ioremap #endif -#ifndef ARCH_HAS_IOREMAP_WT +#ifndef ioremap_wt #define ioremap_wt ioremap #endif -#ifndef ARCH_HAS_IOREMAP_NP +#ifndef ioremap_np /* See the comment in asm-generic/io.h about ioremap_np(). */ #define ioremap_np ioremap_np static inline void __iomem *ioremap_np(phys_addr_t offset, size_t size) -- cgit v1.2.3 From 016fec91013cfb27c9f5a101a87ae8266537fed1 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:17 +0800 Subject: mm: move is_ioremap_addr() into new header file Now is_ioremap_addr() is only used in kernel/iomem.c and gonna be used in mm/ioremap.c. Move it into its own new header file linux/ioremap.h. Link: https://lkml.kernel.org/r/20230706154520.11257-17-bhe@redhat.com Suggested-by: Christoph Hellwig Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgtable.h | 10 ---------- include/linux/ioremap.h | 30 ++++++++++++++++++++++++++++++ include/linux/mm.h | 5 ----- kernel/iomem.c | 1 + mm/ioremap.c | 10 +--------- 5 files changed, 32 insertions(+), 24 deletions(-) create mode 100644 include/linux/ioremap.h (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 6a88bfdaa69b..445a22987aa3 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -157,16 +157,6 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) return (pgtable_t)pmd_page_vaddr(pmd); } -#ifdef CONFIG_PPC64 -#define is_ioremap_addr is_ioremap_addr -static inline bool is_ioremap_addr(const void *x) -{ - unsigned long addr = (unsigned long)x; - - return addr >= IOREMAP_BASE && addr < IOREMAP_END; -} -#endif /* CONFIG_PPC64 */ - #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h new file mode 100644 index 000000000000..f0e99fc7dd8b --- /dev/null +++ b/include/linux/ioremap.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IOREMAP_H +#define _LINUX_IOREMAP_H + +#include +#include + +#if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP) +/* + * Ioremap often, but not always uses the generic vmalloc area. E.g on + * Power ARCH, it could have different ioremap space. + */ +#ifndef IOREMAP_START +#define IOREMAP_START VMALLOC_START +#define IOREMAP_END VMALLOC_END +#endif +static inline bool is_ioremap_addr(const void *x) +{ + unsigned long addr = (unsigned long)kasan_reset_tag(x); + + return addr >= IOREMAP_START && addr < IOREMAP_END; +} +#else +static inline bool is_ioremap_addr(const void *x) +{ + return false; +} +#endif + +#endif /* _LINUX_IOREMAP_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index bfb46483108c..0ae5654f665b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1055,11 +1055,6 @@ unsigned long vmalloc_to_pfn(const void *addr); * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ - -#ifndef is_ioremap_addr -#define is_ioremap_addr(x) is_vmalloc_addr(x) -#endif - #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); diff --git a/kernel/iomem.c b/kernel/iomem.c index 62c92e43aa0d..9682471e6471 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -3,6 +3,7 @@ #include #include #include +#include #ifndef ioremap_cache /* temporary while we convert existing ioremap_cache users to memremap */ diff --git a/mm/ioremap.c b/mm/ioremap.c index 68d9895144ad..a21a6c9fa5ab 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -10,15 +10,7 @@ #include #include #include - -/* - * Ioremap often, but not always uses the generic vmalloc area. E.g on - * Power ARCH, it could have different ioremap space. - */ -#ifndef IOREMAP_START -#define IOREMAP_START VMALLOC_START -#define IOREMAP_END VMALLOC_END -#endif +#include void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot) -- cgit v1.2.3 From 8d05554dca2af6a77dc38a152f4e84fdf7e35179 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 6 Jul 2023 23:45:18 +0800 Subject: powerpc: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot() and iounmap() for powerpc's special operation when ioremap() and iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-18-bhe@redhat.com Signed-off-by: Christophe Leroy Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Nathan Chancellor Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/io.h | 8 +++----- arch/powerpc/mm/ioremap.c | 26 +------------------------- arch/powerpc/mm/ioremap_32.c | 19 +++++++++---------- arch/powerpc/mm/ioremap_64.c | 12 ++---------- 5 files changed, 16 insertions(+), 50 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 0b1172cbeccb..9222c138c457 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -193,6 +193,7 @@ config PPC select GENERIC_CPU_VULNERABILITIES if PPC_BARRIER_NOSPEC select GENERIC_EARLY_IOREMAP select GENERIC_GETTIMEOFDAY + select GENERIC_IOREMAP select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAP if PCI diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 67a3fb6de498..0732b743e099 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -889,8 +889,8 @@ static inline void iosync(void) * */ extern void __iomem *ioremap(phys_addr_t address, unsigned long size); -extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size, - unsigned long flags); +#define ioremap ioremap +#define ioremap_prot ioremap_prot extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size); #define ioremap_wc ioremap_wc @@ -904,14 +904,12 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); #define ioremap_cache(addr, size) \ ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL)) -extern void iounmap(volatile void __iomem *addr); +#define iounmap iounmap void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size); int early_ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot); -void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, - pgprot_t prot, void *caller); extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, pgprot_t prot, void *caller); diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 4f12504fb405..705e8e8ffde4 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -41,7 +41,7 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) return __ioremap_caller(addr, size, prot, caller); } -void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags) { pte_t pte = __pte(flags); void *caller = __builtin_return_address(0); @@ -74,27 +74,3 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa, return 0; } - -void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, - pgprot_t prot, void *caller) -{ - struct vm_struct *area; - int ret; - unsigned long va; - - area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller); - if (area == NULL) - return NULL; - - area->phys_addr = pa; - va = (unsigned long)area->addr; - - ret = ioremap_page_range(va, va + size, pa, prot); - if (!ret) - return (void __iomem *)area->addr + offset; - - vunmap_range(va, va + size); - free_vm_area(area); - - return NULL; -} diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c index 9d13143b8be4..ca5bc6be3e6f 100644 --- a/arch/powerpc/mm/ioremap_32.c +++ b/arch/powerpc/mm/ioremap_32.c @@ -21,6 +21,13 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call phys_addr_t p, offset; int err; + /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (addr < SZ_16M) + addr += _ISA_MEM_BASE; + /* * Choose an address to map it to. * Once the vmalloc system is running, we use it. @@ -31,13 +38,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call offset = addr & ~PAGE_MASK; size = PAGE_ALIGN(addr + size) - p; - /* - * If the address lies within the first 16 MB, assume it's in ISA - * memory space - */ - if (p < 16 * 1024 * 1024) - p += _ISA_MEM_BASE; - #ifndef CONFIG_CRASH_DUMP /* * Don't allow anybody to remap normal RAM that we're using. @@ -63,7 +63,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call return (void __iomem *)v + offset; if (slab_is_available()) - return do_ioremap(p, offset, size, prot, caller); + return generic_ioremap_prot(addr, size, prot); /* * Should check if it is a candidate for a BAT mapping @@ -87,7 +87,6 @@ void iounmap(volatile void __iomem *addr) if (v_block_mapped((unsigned long)addr)) return; - if (addr > high_memory && (unsigned long)addr < ioremap_bot) - vunmap((void *)(PAGE_MASK & (unsigned long)addr)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index 3acece00b33e..d24e5f166723 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -29,7 +29,7 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, return NULL; if (slab_is_available()) - return do_ioremap(paligned, offset, size, prot, caller); + return generic_ioremap_prot(addr, size, prot); pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller); @@ -49,17 +49,9 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, */ void iounmap(volatile void __iomem *token) { - void *addr; - if (!slab_is_available()) return; - addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK); - - if ((unsigned long)addr < ioremap_bot) { - pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr); - return; - } - vunmap(addr); + generic_iounmap(PCI_FIX_ADDR(token)); } EXPORT_SYMBOL(iounmap); -- cgit v1.2.3 From 6bbd42e2df8f90b022fa0b82986d58ecb30b9dcc Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:05 +1000 Subject: mmu_notifiers: call invalidate_range() when invalidating TLBs The invalidate_range() is going to become an architecture specific mmu notifier used to keep the TLB of secondary MMUs such as an IOMMU in sync with the CPU page tables. Currently it is called from separate code paths to the main CPU TLB invalidations. This can lead to a secondary TLB not getting invalidated when required and makes it hard to reason about when exactly the secondary TLB is invalidated. To fix this move the notifier call to the architecture specific TLB maintenance functions for architectures that have secondary MMUs requiring explicit software invalidations. This fixes a SMMU bug on ARM64. On ARM64 PTE permission upgrades require a TLB invalidation. This invalidation is done by the architecture specific ptep_set_access_flags() which calls flush_tlb_page() if required. However this doesn't call the notifier resulting in infinite faults being generated by devices using the SMMU if it has previously cached a read-only PTE in it's TLB. Moving the invalidations into the TLB invalidation functions ensures all invalidations happen at the same time as the CPU invalidation. The architecture specific flush_tlb_all() routines do not call the notifier as none of the IOMMUs require this. Link: https://lkml.kernel.org/r/0287ae32d91393a582897d6c4db6f7456b1001f2.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Suggested-by: Jason Gunthorpe Tested-by: SeongJae Park Acked-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Tested-by: Luis Chamberlain Cc: Andrew Donnellan Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 5 +++++ arch/powerpc/include/asm/book3s/64/tlbflush.h | 1 + arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 1 + arch/powerpc/mm/book3s64/radix_tlb.c | 4 ++++ arch/x86/include/asm/tlbflush.h | 2 ++ arch/x86/mm/tlb.c | 2 ++ include/asm-generic/tlb.h | 1 - 7 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3456866c6a1d..a99349d10c0e 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -252,6 +253,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); dsb(ish); + mmu_notifier_invalidate_range(mm, 0, -1UL); } static inline void __flush_tlb_page_nosync(struct mm_struct *mm, @@ -263,6 +265,8 @@ static inline void __flush_tlb_page_nosync(struct mm_struct *mm, addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); + mmu_notifier_invalidate_range(mm, uaddr & PAGE_MASK, + (uaddr & PAGE_MASK) + PAGE_SIZE); } static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, @@ -396,6 +400,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, scale++; } dsb(ish); + mmu_notifier_invalidate_range(vma->vm_mm, start, end); } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 0d0c1447ecf0..dca0477b0709 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -5,6 +5,7 @@ #define MMU_NO_CONTEXT ~0UL #include +#include #include #include diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index 5e3195568525..f3fb49fd32fe 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,6 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); + mmu_notifier_invalidate_range(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 0bd4866d9824..4d44902a4962 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -987,6 +987,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); + mmu_notifier_invalidate_range(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -1020,6 +1021,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } preempt_enable(); + mmu_notifier_invalidate_range(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -1228,6 +1230,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_invalidate_range(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1392,6 +1395,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_invalidate_range(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 837e4a50281a..0a5432364c5a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -3,6 +3,7 @@ #define _ASM_X86_TLBFLUSH_H #include +#include #include #include @@ -282,6 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); + mmu_notifier_invalidate_range(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 267acf27480a..93b2f81f09da 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -1036,6 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_flush_tlb_info(); put_cpu(); + mmu_notifier_invalidate_range(mm, start, end); } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index b46617207c93..bc32a2284c56 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -456,7 +456,6 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) return; tlb_flush(tlb); - mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); __tlb_reset_range(tlb); } -- cgit v1.2.3 From 1af5a8109904b7f00828e7f9f63f5695b42f8215 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:07 +1000 Subject: mmu_notifiers: rename invalidate_range notifier There are two main use cases for mmu notifiers. One is by KVM which uses mmu_notifier_invalidate_range_start()/end() to manage a software TLB. The other is to manage hardware TLBs which need to use the invalidate_range() callback because HW can establish new TLB entries at any time. Hence using start/end() can lead to memory corruption as these callbacks happen too soon/late during page unmap. mmu notifier users should therefore either use the start()/end() callbacks or the invalidate_range() callbacks. To make this usage clearer rename the invalidate_range() callback to arch_invalidate_secondary_tlbs() and update documention. Link: https://lkml.kernel.org/r/6f77248cd25545c8020a54b4e567e8b72be4dca1.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Suggested-by: Jason Gunthorpe Acked-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 6 ++-- arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 2 +- arch/powerpc/mm/book3s64/radix_tlb.c | 8 ++--- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/mm/tlb.c | 2 +- drivers/iommu/amd/iommu_v2.c | 10 +++--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 13 +++---- drivers/iommu/intel/svm.c | 8 ++--- drivers/misc/ocxl/link.c | 8 ++--- include/linux/mmu_notifier.h | 48 ++++++++++++------------- mm/huge_memory.c | 4 +-- mm/hugetlb.c | 7 ++-- mm/mmu_notifier.c | 21 ++++++++--- 13 files changed, 76 insertions(+), 63 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a99349d10c0e..84a05a0bd2b6 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -253,7 +253,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); dsb(ish); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void __flush_tlb_page_nosync(struct mm_struct *mm, @@ -265,7 +265,7 @@ static inline void __flush_tlb_page_nosync(struct mm_struct *mm, addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); - mmu_notifier_invalidate_range(mm, uaddr & PAGE_MASK, + mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK, (uaddr & PAGE_MASK) + PAGE_SIZE); } @@ -400,7 +400,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, scale++; } dsb(ish); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index f3fb49fd32fe..17075c78d4bc 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,7 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 4d44902a4962..06e647ef19d1 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -987,7 +987,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -1021,7 +1021,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } preempt_enable(); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -1230,7 +1230,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } out: preempt_enable(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1395,7 +1395,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 0a5432364c5a..6ab42caaa67a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -283,7 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 93b2f81f09da..2d253919b3e8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1037,7 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_flush_tlb_info(); put_cpu(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index 261352a23271..2596466cd5a6 100644 --- a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -355,9 +355,9 @@ static struct pasid_state *mn_to_state(struct mmu_notifier *mn) return container_of(mn, struct pasid_state, mn); } -static void mn_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void mn_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct pasid_state *pasid_state; struct device_state *dev_state; @@ -391,8 +391,8 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { - .release = mn_release, - .invalidate_range = mn_invalidate_range, + .release = mn_release, + .arch_invalidate_secondary_tlbs = mn_arch_invalidate_secondary_tlbs, }; static void set_pri_tag_status(struct pasid_state *pasid_state, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 2a19784b698e..dbc812a0e57e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -186,9 +186,10 @@ static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd) } } -static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) { struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); struct arm_smmu_domain *smmu_domain = smmu_mn->domain; @@ -247,9 +248,9 @@ static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn) } static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { - .invalidate_range = arm_smmu_mm_invalidate_range, - .release = arm_smmu_mm_release, - .free_notifier = arm_smmu_mmu_notifier_free, + .arch_invalidate_secondary_tlbs = arm_smmu_mm_arch_invalidate_secondary_tlbs, + .release = arm_smmu_mm_release, + .free_notifier = arm_smmu_mmu_notifier_free, }; /* Allocate or get existing MMU notifier for this {domain, mm} pair */ diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e95b339e9cdc..8f6d68006ab6 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -219,9 +219,9 @@ static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address, } /* Pages have been freed at this point */ -static void intel_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); @@ -256,7 +256,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops intel_mmuops = { .release = intel_mm_release, - .invalidate_range = intel_invalidate_range, + .arch_invalidate_secondary_tlbs = intel_arch_invalidate_secondary_tlbs, }; static DEFINE_MUTEX(pasid_mutex); diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index 4cf4c55a5f00..c06c699c0e7b 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -491,9 +491,9 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle) } EXPORT_SYMBOL_GPL(ocxl_link_release); -static void invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier); struct ocxl_link *link = pe_data->link; @@ -509,7 +509,7 @@ static void invalidate_range(struct mmu_notifier *mn, } static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = { - .invalidate_range = invalidate_range, + .arch_invalidate_secondary_tlbs = arch_invalidate_secondary_tlbs, }; static u64 calculate_cfg_state(bool kernel) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index f2e9edc6aa43..6e3c857606f1 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -187,27 +187,27 @@ struct mmu_notifier_ops { const struct mmu_notifier_range *range); /* - * invalidate_range() is either called between - * invalidate_range_start() and invalidate_range_end() when the - * VM has to free pages that where unmapped, but before the - * pages are actually freed, or outside of _start()/_end() when - * a (remote) TLB is necessary. + * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB + * which shares page-tables with the CPU. The + * invalidate_range_start()/end() callbacks should not be implemented as + * invalidate_secondary_tlbs() already catches the points in time when + * an external TLB needs to be flushed. * - * If invalidate_range() is used to manage a non-CPU TLB with - * shared page-tables, it not necessary to implement the - * invalidate_range_start()/end() notifiers, as - * invalidate_range() already catches the points in time when an - * external TLB range needs to be flushed. For more in depth - * discussion on this see Documentation/mm/mmu_notifier.rst + * This requires arch_invalidate_secondary_tlbs() to be called while + * holding the ptl spin-lock and therefore this callback is not allowed + * to sleep. * - * Note that this function might be called with just a sub-range - * of what was passed to invalidate_range_start()/end(), if - * called between those functions. + * This is called by architecture code whenever invalidating a TLB + * entry. It is assumed that any secondary TLB has the same rules for + * when invalidations are required. If this is not the case architecture + * code will need to call this explicitly when required for secondary + * TLB invalidation. */ - void (*invalidate_range)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + void (*arch_invalidate_secondary_tlbs)( + struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * These callbacks are used with the get/put interface to manage the @@ -396,8 +396,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); -extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); @@ -483,11 +483,11 @@ mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) __mmu_notifier_invalidate_range_end(range); } -static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range(mm, start, end); + __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) @@ -664,7 +664,7 @@ void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } -static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, +static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3ece117de898..e0420de0e2e0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2120,8 +2120,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling - * mmu_notifier_invalidate_range() see comments below inside - * __split_huge_pmd() ? + * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below + * inside __split_huge_pmd() ? * * We are going from a zero huge page write protected to zero * small page also write protected so it does not seems useful diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4672752b0b17..5ef7bccda50c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6649,8 +6649,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, else flush_hugetlb_tlb_range(vma, start, end); /* - * No need to call mmu_notifier_invalidate_range() we are downgrading - * page table protection not changing it to point to a new page. + * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are + * downgrading page table protection not changing it to point to a new + * page. * * See Documentation/mm/mmu_notifier.rst */ @@ -7294,7 +7295,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); /* - * No need to call mmu_notifier_invalidate_range(), see + * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 453a156d93c0..ec3b068cbbe6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -585,8 +585,8 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) lock_map_release(&__mmu_notifier_invalidate_range_start_map); } -void __mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end) +void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct mmu_notifier *subscription; int id; @@ -595,9 +595,10 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, hlist_for_each_entry_rcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { - if (subscription->ops->invalidate_range) - subscription->ops->invalidate_range(subscription, mm, - start, end); + if (subscription->ops->arch_invalidate_secondary_tlbs) + subscription->ops->arch_invalidate_secondary_tlbs( + subscription, mm, + start, end); } srcu_read_unlock(&srcu, id); } @@ -616,6 +617,16 @@ int __mmu_notifier_register(struct mmu_notifier *subscription, mmap_assert_write_locked(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); + /* + * Subsystems should only register for invalidate_secondary_tlbs() or + * invalidate_range_start()/end() callbacks, not both. + */ + if (WARN_ON_ONCE(subscription && + (subscription->ops->arch_invalidate_secondary_tlbs && + (subscription->ops->invalidate_range_start || + subscription->ops->invalidate_range_end)))) + return -EINVAL; + if (!mm->notifier_subscriptions) { /* * kmalloc cannot be called under mm_take_all_locks(), but we -- cgit v1.2.3 From 284e05920498788c5df1a7dd6424adb426498e1c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:01 +0100 Subject: mm: remove CONFIG_PER_VMA_LOCK ifdefs Patch series "Handle most file-backed faults under the VMA lock", v3. This patchset adds the ability to handle page faults on parts of files which are already in the page cache without taking the mmap lock. This patch (of 10): Provide lock_vma_under_rcu() when CONFIG_PER_VMA_LOCK is not defined to eliminate ifdefs in the users. Link: https://lkml.kernel.org/r/20230724185410.1124082-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230724185410.1124082-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Punit Agrawal Cc: Arjun Roy Cc: Eric Dumazet Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 2 -- arch/powerpc/mm/fault.c | 4 ---- arch/riscv/mm/fault.c | 4 ---- arch/s390/mm/fault.c | 2 -- arch/x86/mm/fault.c | 4 ---- include/linux/mm.h | 6 ++++++ 6 files changed, 6 insertions(+), 16 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3fe516b32577..103fcbdc6552 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -587,7 +587,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); -#ifdef CONFIG_PER_VMA_LOCK if (!(mm_flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -615,7 +614,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, return 0; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, addr, regs); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5bfdf6ecfa96..fafce6bdeff0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -469,7 +469,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, if (is_exec) flags |= FAULT_FLAG_INSTRUCTION; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -501,7 +500,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, return user_mode(regs) ? 0 : SIGBUS; lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -551,9 +549,7 @@ retry: mmap_read_unlock(current->mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 6ea2cce4cc17..046732fcb48c 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -283,7 +283,6 @@ void handle_page_fault(struct pt_regs *regs) flags |= FAULT_FLAG_WRITE; else if (cause == EXC_INST_PAGE_FAULT) flags |= FAULT_FLAG_INSTRUCTION; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -311,7 +310,6 @@ void handle_page_fault(struct pt_regs *regs) return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, addr, regs); @@ -368,9 +366,7 @@ retry: mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (unlikely(fault & VM_FAULT_ERROR)) { tsk->thread.bad_cause = cause; mm_fault_error(regs, addr, fault); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2f123429a291..6f6b9881e55e 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -407,7 +407,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) access = VM_WRITE; if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); @@ -432,7 +431,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ mmap_read_lock(mm); gmap = NULL; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e8711b2cafaf..787da09d24f3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1328,7 +1328,6 @@ void do_user_addr_fault(struct pt_regs *regs, } #endif -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -1358,7 +1357,6 @@ void do_user_addr_fault(struct pt_regs *regs, return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, address, regs); @@ -1418,9 +1416,7 @@ retry: } mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (likely(!(fault & VM_FAULT_ERROR))) return; diff --git a/include/linux/mm.h b/include/linux/mm.h index ded514ee2588..21299a0cfbca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -742,6 +742,12 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} +static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + return NULL; +} + #endif /* CONFIG_PER_VMA_LOCK */ /* -- cgit v1.2.3 From 104c49d5b6dc8b38aea0cb7300068b378cf37ac1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:54 +0530 Subject: powerpc/mm/trace: convert trace event to trace event class A follow-up patch will add a pud variant for this same event. Using event class makes that addition simpler. No functional change in this patch. Link: https://lkml.kernel.org/r/20230724190759.483013-9-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/hash_pgtable.c | 2 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- include/trace/events/thp.h | 23 ++++++++++++++++------- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 51f48984abca..988948d69bc1 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -214,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr old = be64_to_cpu(old_be); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); if (old & H_PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); return old; diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index e7ea492ac510..02e185d2e4d6 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -962,7 +962,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add #endif old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); return old; } diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index 202b3e3e67ff..a95c78b10561 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -8,25 +8,29 @@ #include #include -TRACE_EVENT(hugepage_set_pmd, +DECLARE_EVENT_CLASS(hugepage_set, - TP_PROTO(unsigned long addr, unsigned long pmd), - TP_ARGS(addr, pmd), + TP_PROTO(unsigned long addr, unsigned long pte), + TP_ARGS(addr, pte), TP_STRUCT__entry( __field(unsigned long, addr) - __field(unsigned long, pmd) + __field(unsigned long, pte) ), TP_fast_assign( __entry->addr = addr; - __entry->pmd = pmd; + __entry->pte = pte; ), - TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd) + TP_printk("Set page table entry with 0x%lx with 0x%lx", __entry->addr, __entry->pte) ); +DEFINE_EVENT(hugepage_set, hugepage_set_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd), + TP_ARGS(addr, pmd) +); -TRACE_EVENT(hugepage_update, +DECLARE_EVENT_CLASS(hugepage_update, TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, unsigned long set), TP_ARGS(addr, pte, clr, set), @@ -48,6 +52,11 @@ TRACE_EVENT(hugepage_update, TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set) ); +DEFINE_EVENT(hugepage_update, hugepage_update_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd, unsigned long clr, unsigned long set), + TP_ARGS(addr, pmd, clr, set) +); + DECLARE_EVENT_CLASS(migration_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), -- cgit v1.2.3 From 27af67f35631ac4b61b5e4455b44c9aee8d2cc4b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:55 +0530 Subject: powerpc/book3s64/mm: enable transparent pud hugepage This is enabled only with radix translation and 1G hugepage size. This will be used with devdax device memory with a namespace alignment of 1G. Anon transparent hugepage is not supported even though we do have helpers checking pud_trans_huge(). We should never find that return true. The only expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP. Some of the helpers are never expected to get called on hash translation and hence is marked to call BUG() in such a case. Link: https://lkml.kernel.org/r/20230724190759.483013-10-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/hash.h | 9 ++ arch/powerpc/include/asm/book3s/64/pgtable.h | 155 +++++++++++++++++++-- arch/powerpc/include/asm/book3s/64/radix.h | 36 +++++ .../powerpc/include/asm/book3s/64/tlbflush-radix.h | 2 + arch/powerpc/include/asm/book3s/64/tlbflush.h | 8 ++ arch/powerpc/mm/book3s64/pgtable.c | 78 +++++++++++ arch/powerpc/mm/book3s64/radix_pgtable.c | 28 ++++ arch/powerpc/mm/book3s64/radix_tlb.c | 7 + arch/powerpc/platforms/Kconfig.cputype | 1 + include/trace/events/thp.h | 10 ++ 10 files changed, 323 insertions(+), 11 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index d4a19e6547ac..6e70ae511631 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -138,7 +138,16 @@ static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) } #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) + +/* + * pud comparison that will work with both pte and page table pointer. + */ +static inline int hash__pud_same(pud_t pud_a, pud_t pud_b) +{ + return (((pud_raw(pud_a) ^ pud_raw(pud_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); +} #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) + static inline int hash__p4d_bad(p4d_t p4d) { return (p4d_val(p4d) == 0); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 4acc9690f599..a8204566cfd0 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte) { return __pud_raw(pte_raw(pte)); } + +static inline pte_t *pudp_ptep(pud_t *pud) +{ + return (pte_t *)pud; +} + +#define pud_pfn(pud) pte_pfn(pud_pte(pud)) +#define pud_dirty(pud) pte_dirty(pud_pte(pud)) +#define pud_young(pud) pte_young(pud_pte(pud)) +#define pud_mkold(pud) pte_pud(pte_mkold(pud_pte(pud))) +#define pud_wrprotect(pud) pte_pud(pte_wrprotect(pud_pte(pud))) +#define pud_mkdirty(pud) pte_pud(pte_mkdirty(pud_pte(pud))) +#define pud_mkclean(pud) pte_pud(pte_mkclean(pud_pte(pud))) +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) +#define pud_mkwrite(pud) pte_pud(pte_mkwrite(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +#define pud_soft_dirty(pmd) pte_soft_dirty(pud_pte(pud)) +#define pud_mksoft_dirty(pmd) pte_pud(pte_mksoft_dirty(pud_pte(pud))) +#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud))) +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + static inline int pud_bad(pud_t pud) { if (radix_enabled()) @@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write) #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); +extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot); extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); +extern void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud); + static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) +{ +} + extern int hash__has_transparent_hugepage(void); static inline int has_transparent_hugepage(void) { @@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void) } #define has_transparent_hugepage has_transparent_hugepage +static inline int has_transparent_pud_hugepage(void) +{ + if (radix_enabled()) + return radix__has_transparent_pud_hugepage(); + return 0; +} +#define has_transparent_pud_hugepage has_transparent_pud_hugepage + static inline unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, unsigned long set) @@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); } +static inline unsigned long +pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp, + unsigned long clr, unsigned long set) +{ + if (radix_enabled()) + return radix__pud_hugepage_update(mm, addr, pudp, clr, set); + BUG(); + return pud_val(*pudp); +} + /* * returns true for pmd migration entries, THP, devmap, hugetlb * But compile time dependent on THP config @@ -1151,6 +1199,11 @@ static inline int pmd_large(pmd_t pmd) return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); } +static inline int pud_large(pud_t pud) +{ + return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); +} + /* * For radix we should always find H_PAGE_HASHPTE zero. Hence * the below will work for radix too @@ -1166,6 +1219,17 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, return ((old & _PAGE_ACCESSED) != 0); } +static inline int __pudp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + unsigned long old; + + if ((pud_raw(*pudp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pud_hugepage_update(mm, addr, pudp, _PAGE_ACCESSED, 0); + return ((old & _PAGE_ACCESSED) != 0); +} + #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -1174,6 +1238,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); } +#define __HAVE_ARCH_PUDP_SET_WRPROTECT +static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pud_t *pudp) +{ + if (pud_write(*pudp)) + pud_hugepage_update(mm, addr, pudp, _PAGE_WRITE, 0); +} + /* * Only returns true for a THP. False for pmd migration entry. * We also need to return true when we come across a pte that @@ -1195,6 +1267,17 @@ static inline int pmd_trans_huge(pmd_t pmd) return hash__pmd_trans_huge(pmd); } +static inline int pud_trans_huge(pud_t pud) +{ + if (!pud_present(pud)) + return false; + + if (radix_enabled()) + return radix__pud_trans_huge(pud); + return 0; +} + + #define __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { @@ -1203,6 +1286,15 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) return hash__pmd_same(pmd_a, pmd_b); } +#define pud_same pud_same +static inline int pud_same(pud_t pud_a, pud_t pud_b) +{ + if (radix_enabled()) + return radix__pud_same(pud_a, pud_b); + return hash__pud_same(pud_a, pud_b); +} + + static inline pmd_t __pmd_mkhuge(pmd_t pmd) { if (radix_enabled()) @@ -1210,6 +1302,14 @@ static inline pmd_t __pmd_mkhuge(pmd_t pmd) return hash__pmd_mkhuge(pmd); } +static inline pud_t __pud_mkhuge(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_mkhuge(pud); + BUG(); + return pud; +} + /* * pfn_pmd return a pmd_t that can be used as pmd pte entry. */ @@ -1225,14 +1325,34 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd) return pmd; } +static inline pud_t pud_mkhuge(pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + if (radix_enabled()) + WARN_ON((pud_raw(pud) & cpu_to_be64(_PAGE_PTE)) == 0); + else + WARN_ON(1); +#endif + return pud; +} + + #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +#define __HAVE_ARCH_PUDP_SET_ACCESS_FLAGS +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +#define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG +extern int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp); + #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, @@ -1243,6 +1363,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (radix_enabled()) + return radix__pudp_huge_get_and_clear(mm, addr, pudp); + BUG(); + return *pudp; +} + static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { @@ -1257,6 +1387,11 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full); +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, + pud_t *pudp, int full); + #define __HAVE_ARCH_PGTABLE_DEPOSIT static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) @@ -1305,6 +1440,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) return hash__pmd_mkdevmap(pmd); } +static inline pud_t pud_mkdevmap(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_mkdevmap(pud); + BUG(); + return pud; +} + static inline int pmd_devmap(pmd_t pmd) { return pte_devmap(pmd_pte(pmd)); @@ -1312,7 +1455,7 @@ static inline int pmd_devmap(pmd_t pmd) static inline int pud_devmap(pud_t pud) { - return 0; + return pte_devmap(pud_pte(pud)); } static inline int pgd_devmap(pgd_t pgd) @@ -1321,16 +1464,6 @@ static inline int pgd_devmap(pgd_t pgd) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int pud_pfn(pud_t pud) -{ - /* - * Currently all calls to pud_pfn() are gated around a pud_devmap() - * check so this should never be used. If it grows another user we - * want to know about it. - */ - BUILD_BUG(); - return 0; -} #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 686001eda936..2ef92f36340f 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -250,6 +250,10 @@ static inline int radix__pud_bad(pud_t pud) return !!(pud_val(pud) & RADIX_PUD_BAD_BITS); } +static inline int radix__pud_same(pud_t pud_a, pud_t pud_b) +{ + return ((pud_raw(pud_a) ^ pud_raw(pud_b)) == 0); +} static inline int radix__p4d_bad(p4d_t p4d) { @@ -268,9 +272,22 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) return __pmd(pmd_val(pmd) | _PAGE_PTE); } +static inline int radix__pud_trans_huge(pud_t pud) +{ + return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE; +} + +static inline pud_t radix__pud_mkhuge(pud_t pud) +{ + return __pud(pud_val(pud) | _PAGE_PTE); +} + extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, unsigned long set); +extern unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set); extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, @@ -278,6 +295,9 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp); + static inline int radix__has_transparent_hugepage(void) { /* For radix 2M at PMD level means thp */ @@ -285,6 +305,14 @@ static inline int radix__has_transparent_hugepage(void) return 1; return 0; } + +static inline int radix__has_transparent_pud_hugepage(void) +{ + /* For radix 1G at PUD level means pud hugepage support */ + if (mmu_psize_defs[MMU_PAGE_1G].shift == PUD_SHIFT) + return 1; + return 0; +} #endif static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) @@ -292,9 +320,17 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP)); } +static inline pud_t radix__pud_mkdevmap(pud_t pud) +{ + return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP)); +} + +struct vmem_altmap; extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap); extern void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 77797a2a82eb..a38542259fab 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize); extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index dca0477b0709..1950c1b825b4 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -51,6 +51,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma, radix__flush_pmd_tlb_range(vma, start, end); } +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE +static inline void flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (radix_enabled()) + radix__flush_pud_tlb_range(vma, start, end); +} + #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 85c84e89e3ea..75b938268b04 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, return changed; } +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(vma->vm_mm, pudp)); +#endif + changed = !pud_same(*(pudp), entry); + if (changed) { + /* + * We can use MMU_PAGE_1G here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pudp_ptep(pudp), + pud_pte(entry), address, MMU_PAGE_1G); + } + return changed; +} + + int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } + +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); +} + /* * set a new huge pmd. We should not be called for updating * an existing pmd entry. That should go via pmd_hugepage_update. @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } +void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pud_pte(*pudp))); + assert_spin_locked(pud_lockptr(mm, pudp)); + WARN_ON(!(pud_large(pud))); +#endif + trace_hugepage_set_pud(addr, pud_val(pud)); + return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); +} + static void do_serialize(void *arg) { /* We've taken the IPI, so try to trim the mask while here */ @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, return pmd; } +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, int full) +{ + pud_t pud; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) || + !pud_present(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp); + /* + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. + */ + if (!full) + flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE); + return pud; +} + static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) { return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); } +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot) +{ + return __pud(pud_val(pud) | pgprot_val(pgprot)); +} + /* * At some point we should be able to get rid of * pmd_mkhuge() and mk_huge_pmd() when we update all the @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot)); } +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pudv; + + pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + + return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); +} + pmd_t mk_pmd(struct page *page, pgprot_t pgprot) { return pfn_pmd(page_to_pfn(page), pgprot); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 02e185d2e4d6..227fea53c217 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -967,6 +967,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add return old; } +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(mm, pudp)); +#endif + + old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); + trace_hugepage_update_pud(addr, old, clr, set); + + return old; +} + pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) @@ -1043,6 +1060,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old_pud; + unsigned long old; + + old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0); + old_pud = __pud(old); + return old_pud; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 06e647ef19d1..3020a8b38572 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -1465,6 +1465,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, } EXPORT_SYMBOL(radix__flush_pmd_tlb_range); +void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G); +} +EXPORT_SYMBOL(radix__flush_pud_tlb_range); + void radix__flush_tlb_all(void) { unsigned long rb,prs,r,rs; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 45fd975ef521..340b86ef7284 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -94,6 +94,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_SPLIT_PMD_PTLOCK select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index a95c78b10561..f50048af5fcc 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -30,6 +30,11 @@ DEFINE_EVENT(hugepage_set, hugepage_set_pmd, TP_ARGS(addr, pmd) ); +DEFINE_EVENT(hugepage_set, hugepage_set_pud, + TP_PROTO(unsigned long addr, unsigned long pud), + TP_ARGS(addr, pud) +); + DECLARE_EVENT_CLASS(hugepage_update, TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, unsigned long set), @@ -57,6 +62,11 @@ DEFINE_EVENT(hugepage_update, hugepage_update_pmd, TP_ARGS(addr, pmd, clr, set) ); +DEFINE_EVENT(hugepage_update, hugepage_update_pud, + TP_PROTO(unsigned long addr, unsigned long pud, unsigned long clr, unsigned long set), + TP_ARGS(addr, pud, clr, set) +); + DECLARE_EVENT_CLASS(migration_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), -- cgit v1.2.3 From 368a0590d954a659b16ab945328ada0cc10f93a0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:56 +0530 Subject: powerpc/book3s64/vmemmap: switch radix to use a different vmemmap handling function This is in preparation to update radix to implement vmemmap optimization for devdax. Below are the rules w.r.t radix vmemmap mapping 1. First try to map things using PMD (2M) 2. With altmap if altmap cross-boundary check returns true, fall back to PAGE_SIZE 3. If we can't allocate PMD_SIZE backing memory for vmemmap, fallback to PAGE_SIZE On removing vmemmap mapping, check if every subsection that is using the vmemmap area is invalid. If found to be invalid, that implies we can safely free the vmemmap area. We don't use the PAGE_UNUSED pattern used by x86 because with 64K page size, we need to do the above check even at the PAGE_SIZE granularity. [aneesh.kumar@linux.ibm.com: fix section mismatch warning] Link: https://lkml.kernel.org/r/87h6pqvu5g.fsf@linux.ibm.com [aneesh.kumar@linux.ibm.com: fix kernel build error] Link: https://lkml.kernel.org/r/877cqkwd20.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-11-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/radix.h | 2 + arch/powerpc/include/asm/pgtable.h | 6 + arch/powerpc/mm/book3s64/radix_pgtable.c | 327 ++++++++++++++++++++++++++--- arch/powerpc/mm/init_64.c | 26 ++- 4 files changed, 329 insertions(+), 32 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 2ef92f36340f..f1461289643a 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -331,6 +331,8 @@ extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long phys); int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap); extern void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size); diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 445a22987aa3..a4893b17705a 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -157,6 +157,12 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) return (pgtable_t)pmd_page_vaddr(pmd); } +#ifdef CONFIG_PPC64 +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size); +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size); +#endif /* CONFIG_PPC64 */ + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 227fea53c217..6d04dd579d03 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -744,8 +744,58 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d) p4d_clear(p4d); } -static void remove_pte_table(pte_t *pte_start, unsigned long addr, - unsigned long end, bool direct) +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); + + return !vmemmap_populated(start, PMD_SIZE); +} + +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); + + return !vmemmap_populated(start, PAGE_SIZE); + +} +#endif + +static void __meminit free_vmemmap_pages(struct page *page, + struct vmem_altmap *altmap, + int order) +{ + unsigned int nr_pages = 1 << order; + + if (altmap) { + unsigned long alt_start, alt_end; + unsigned long base_pfn = page_to_pfn(page); + + /* + * with 2M vmemmap mmaping we can have things setup + * such that even though atlmap is specified we never + * used altmap. + */ + alt_start = altmap->base_pfn; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; + + if (base_pfn >= alt_start && base_pfn < alt_end) { + vmem_altmap_free(altmap, nr_pages); + return; + } + } + + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); +} + +static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long next, pages = 0; pte_t *pte; @@ -759,24 +809,26 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr, if (!pte_present(*pte)) continue; - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { - /* - * The vmemmap_free() and remove_section_mapping() - * codepaths call us with aligned addresses. - */ - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { + if (!direct) + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + pages++; } - - pte_clear(&init_mm, addr, pte); - pages++; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_page_is_unused(addr, next)) { + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + } +#endif } if (direct) update_page_count(mmu_virtual_psize, -pages); } static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, - unsigned long end, bool direct) + unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long next, pages = 0; pte_t *pte_base; @@ -790,18 +842,24 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, continue; if (pmd_is_leaf(*pmd)) { - if (!IS_ALIGNED(addr, PMD_SIZE) || - !IS_ALIGNED(next, PMD_SIZE)) { - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + pages++; } - pte_clear(&init_mm, addr, (pte_t *)pmd); - pages++; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_pmd_is_unused(addr, next)) { + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + } +#endif continue; } pte_base = (pte_t *)pmd_page_vaddr(*pmd); - remove_pte_table(pte_base, addr, next, direct); + remove_pte_table(pte_base, addr, next, direct, altmap); free_pte_table(pte_base, pmd); } if (direct) @@ -809,7 +867,8 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, } static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, - unsigned long end, bool direct) + unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long next, pages = 0; pmd_t *pmd_base; @@ -834,15 +893,16 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, } pmd_base = pud_pgtable(*pud); - remove_pmd_table(pmd_base, addr, next, direct); + remove_pmd_table(pmd_base, addr, next, direct, altmap); free_pmd_table(pmd_base, pud); } if (direct) update_page_count(MMU_PAGE_1G, -pages); } -static void __meminit remove_pagetable(unsigned long start, unsigned long end, - bool direct) +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long addr, next; pud_t *pud_base; @@ -871,7 +931,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end, } pud_base = p4d_pgtable(*p4d); - remove_pud_table(pud_base, addr, next, direct); + remove_pud_table(pud_base, addr, next, direct, altmap); free_pud_table(pud_base, p4d); } @@ -894,7 +954,7 @@ int __meminit radix__create_section_mapping(unsigned long start, int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) { - remove_pagetable(start, end, true); + remove_pagetable(start, end, true, NULL); return 0; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -926,10 +986,223 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, return 0; } +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_large(*pmdp); + + if (large) + vmemmap_verify(pmdp_ptep(pmdp), node, addr, next); + + return large; +} + +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, + unsigned long addr, unsigned long next) +{ + pte_t entry; + pte_t *ptep = pmdp_ptep(pmdp); + + VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, ptep, entry); + asm volatile("ptesync": : :"memory"); + + vmemmap_verify(ptep, node, addr, next); +} + +static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr, + int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pte_t *pte = pte_offset_kernel(pmdp, addr); + + if (pte_none(*pte)) { + pte_t entry; + void *p; + + if (!reuse) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE)) + altmap = NULL; + + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); + if (!p && altmap) + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); + if (!p) + return NULL; + } else { + /* + * When a PTE/PMD entry is freed from the init_mm + * there's a free_pages() call to this page allocated + * above. Thus this get_page() is paired with the + * put_page_testzero() on the freeing path. + * This can only called by certain ZONE_DEVICE path, + * and through vmemmap_populate_compound_pages() when + * slab is available. + */ + get_page(reuse); + p = page_to_virt(reuse); + } + + VM_BUG_ON(!PAGE_ALIGNED(addr)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + asm volatile("ptesync": : :"memory"); + } + return pte; +} + +static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node, + unsigned long address) +{ + pud_t *pud; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(p4d_none(*p4dp))) { + if (unlikely(!slab_is_available())) { + pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + p4d_populate(&init_mm, p4dp, pud); + /* go to the pud_offset */ + } else + return pud_alloc(&init_mm, p4dp, address); + } + return pud_offset(p4dp, address); +} + +static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node, + unsigned long address) +{ + pmd_t *pmd; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pud_none(*pudp))) { + if (unlikely(!slab_is_available())) { + pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pud_populate(&init_mm, pudp, pmd); + } else + return pmd_alloc(&init_mm, pudp, address); + } + return pmd_offset(pudp, address); +} + +static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node, + unsigned long address) +{ + pte_t *pte; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pmd_none(*pmdp))) { + if (unlikely(!slab_is_available())) { + pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pmd_populate(&init_mm, pmdp, pte); + } else + return pte_alloc_kernel(pmdp, address); + } + return pte_offset_kernel(pmdp, address); +} + + + +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + /* + * keep it simple by checking addr PMD_SIZE alignment + * and verifying the device boundary condition. + * For us to use a pmd mapping, both addr and pfn should + * be aligned. We skip if addr is not aligned and for + * pfn we hope we have extra area in the altmap that + * can help to find an aligned block. This can result + * in altmap block allocation failures, in which case + * we fallback to RAM for vmemmap allocation. + */ + if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) || + altmap_cross_boundary(altmap, addr, PMD_SIZE))) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + goto base_mapping; + } + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, node, addr, next); + continue; + } else if (altmap) { + /* + * A vmemmap block allocation can fail due to + * alignment requirements and we trying to align + * things aggressively there by running out of + * space. Try base mapping on failure. + */ + goto base_mapping; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) { + /* + * If a huge mapping exist due to early call to + * vmemmap_populate, let's try to use that. + */ + continue; + } +base_mapping: + /* + * Not able allocate higher order memory to back memmap + * or we found a pointer to pte page. Allocate base page + * size vmemmap + */ + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + + pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL); + if (!pte) + return -ENOMEM; + + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + next = addr + PAGE_SIZE; + } + return 0; +} + #ifdef CONFIG_MEMORY_HOTPLUG void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { - remove_pagetable(start, start + page_size, false); + remove_pagetable(start, start + page_size, true, NULL); +} + +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + remove_pagetable(start, end, false, altmap); } #endif #endif diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index fe1b83020e0d..af0c9891c38f 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -92,7 +92,7 @@ static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_ad * a page table lookup here because with the hash translation we don't keep * vmemmap details in linux page table. */ -static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) { struct page *start; unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size; @@ -183,8 +183,8 @@ static __meminit int vmemmap_list_populate(unsigned long phys, return 0; } -static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, - unsigned long page_size) +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) { unsigned long nr_pfn = page_size / sizeof(struct page); unsigned long start_pfn = page_to_pfn((struct page *)start); @@ -204,6 +204,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_populate(start, end, node, altmap); +#endif + /* Align to the page size of the linear mapping. */ start = ALIGN_DOWN(start, page_size); @@ -303,8 +308,8 @@ static unsigned long vmemmap_list_free(unsigned long start) return vmem_back->phys; } -void __ref vmemmap_free(unsigned long start, unsigned long end, - struct vmem_altmap *altmap) +static void __ref __vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; unsigned long page_order = get_order(page_size); @@ -362,6 +367,17 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, vmemmap_remove_mapping(start, page_size); } } + +void __ref vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_free(start, end, altmap); +#endif + return __vmemmap_free(start, end, altmap); +} + #endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *start_page, unsigned long size) -- cgit v1.2.3 From f2b79c0d79683d552369bb9a7282c1f0226fc566 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:57 +0530 Subject: powerpc/book3s64/radix: add support for vmemmap optimization for radix With 2M PMD-level mapping, we require 32 struct pages and a single vmemmap page can contain 1024 struct pages (PAGE_SIZE/sizeof(struct page)). Hence with 64K page size, we don't use vmemmap deduplication for PMD-level mapping. [aneesh.kumar@linux.ibm.com: ppc64: don't include radix headers if CONFIG_PPC_RADIX_MMU=n] Link: https://lkml.kernel.org/r/87zg3jw8km.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-12-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/mm/vmemmap_dedup.rst | 1 + Documentation/powerpc/index.rst | 1 + Documentation/powerpc/vmemmap_dedup.rst | 101 ++++++++++++++ arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/64/radix.h | 11 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 203 +++++++++++++++++++++++++++++ 6 files changed, 318 insertions(+) create mode 100644 Documentation/powerpc/vmemmap_dedup.rst (limited to 'arch/powerpc') diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst index a4b12ff906c4..c573e08b5043 100644 --- a/Documentation/mm/vmemmap_dedup.rst +++ b/Documentation/mm/vmemmap_dedup.rst @@ -210,6 +210,7 @@ the device (altmap). The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64), PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64). +For powerpc equivalent details see Documentation/powerpc/vmemmap_dedup.rst The differences with HugeTLB are relatively minor. diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst index d33b554ca7ba..a50834798454 100644 --- a/Documentation/powerpc/index.rst +++ b/Documentation/powerpc/index.rst @@ -36,6 +36,7 @@ powerpc ultravisor vas-api vcpudispatch_stats + vmemmap_dedup features diff --git a/Documentation/powerpc/vmemmap_dedup.rst b/Documentation/powerpc/vmemmap_dedup.rst new file mode 100644 index 000000000000..dc4db59fdf87 --- /dev/null +++ b/Documentation/powerpc/vmemmap_dedup.rst @@ -0,0 +1,101 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========== +Device DAX +========== + +The device-dax interface uses the tail deduplication technique explained in +Documentation/mm/vmemmap_dedup.rst + +On powerpc, vmemmap deduplication is only used with radix MMU translation. Also +with a 64K page size, only the devdax namespace with 1G alignment uses vmemmap +deduplication. + +With 2M PMD level mapping, we require 32 struct pages and a single 64K vmemmap +page can contain 1024 struct pages (64K/sizeof(struct page)). Hence there is no +vmemmap deduplication possible. + +With 1G PUD level mapping, we require 16384 struct pages and a single 64K +vmemmap page can contain 1024 struct pages (64K/sizeof(struct page)). Hence we +require 16 64K pages in vmemmap to map the struct page for 1G PUD level mapping. + +Here's how things look like on device-dax after the sections are populated:: + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | ----------------^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | + | | | 3 | ------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | --------------------+ | | | + | PUD | +-----------+ | | | + | level | | . | ----------------------+ | | + | mapping | +-----------+ | | + | | | . | ------------------------+ | + | | +-----------+ | + | | | 15 | --------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ + + +With 4K page size, 2M PMD level mapping requires 512 struct pages and a single +4K vmemmap page contains 64 struct pages(4K/sizeof(struct page)). Hence we +require 8 4K pages in vmemmap to map the struct page for 2M pmd level mapping. + +Here's how things look like on device-dax after the sections are populated:: + + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | ----------------^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | + | | | 3 | ------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | --------------------+ | | | + | PMD | +-----------+ | | | + | level | | 5 | ----------------------+ | | + | mapping | +-----------+ | | + | | | 6 | ------------------------+ | + | | +-----------+ | + | | | 7 | --------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ + +With 1G PUD level mapping, we require 262144 struct pages and a single 4K +vmemmap page can contain 64 struct pages (4K/sizeof(struct page)). Hence we +require 4096 4K pages in vmemmap to map the struct pages for 1G PUD level +mapping. + +Here's how things look like on device-dax after the sections are populated:: + + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | ----------------^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | + | | | 3 | ------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | --------------------+ | | | + | PUD | +-----------+ | | | + | level | | . | ----------------------+ | | + | mapping | +-----------+ | | + | | | . | ------------------------+ | + | | +-----------+ | + | | | 4095 | --------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9222c138c457..d0497d13f5b4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -174,6 +174,7 @@ config PPC select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select ARCH_WANT_LD_ORPHAN_WARN + select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if PPC_RADIX_MMU select ARCH_WANTS_MODULES_DATA_IN_VMALLOC if PPC_BOOK3S_32 || PPC_8xx select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index f1461289643a..357e23a403d3 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -326,6 +326,7 @@ static inline pud_t radix__pud_mkdevmap(pud_t pud) } struct vmem_altmap; +struct dev_pagemap; extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); @@ -363,5 +364,15 @@ int radix__remove_section_mapping(unsigned long start, unsigned long end); void radix__kernel_map_pages(struct page *page, int numpages, int enable); +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP +#define vmemmap_can_optimize vmemmap_can_optimize +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap); +#endif + +#define vmemmap_populate_compound_pages vmemmap_populate_compound_pages +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap); #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 6d04dd579d03..c264e6a3620e 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -986,6 +986,15 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, return 0; } + +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) +{ + if (radix_enabled()) + return __vmemmap_can_optimize(altmap, pgmap); + + return false; +} + int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, unsigned long addr, unsigned long next) { @@ -1193,6 +1202,200 @@ base_mapping: return 0; } +static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return NULL; + radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + return pte; +} + +static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr, + unsigned long pfn_offset, int node) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long map_addr; + + /* the second vmemmap page which we use for duplication */ + map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE; + pgd = pgd_offset_k(map_addr); + p4d = p4d_offset(pgd, map_addr); + pud = vmemmap_pud_alloc(p4d, node, map_addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, map_addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, map_addr); + if (!pte) + return NULL; + /* + * Check if there exist a mapping to the left + */ + if (pte_none(*pte)) { + /* + * Populate the head page vmemmap page. + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL); + if (!pte) + return NULL; + /* + * Populate the tail pages vmemmap page + */ + pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL); + if (!pte) + return NULL; + vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE); + return pte; + } + return pte; +} + +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap) +{ + /* + * we want to map things as base page size mapping so that + * we can save space in vmemmap. We could have huge mapping + * covering out both edges. + */ + unsigned long addr; + unsigned long addr_pfn = start_pfn; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_leaf(READ_ONCE(*pmd))) { + /* existing huge mapping. Skip the range */ + addr_pfn += (PMD_SIZE >> PAGE_SHIFT); + next = pmd_addr_end(addr, end); + continue; + } + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + /* + * This could be because we already have a compound + * page whose VMEMMAP_RESERVE_NR pages were mapped and + * this request fall in those pages. + */ + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } else { + unsigned long nr_pages = pgmap_vmemmap_nr(pgmap); + unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages); + pte_t *tail_page_pte; + + /* + * if the address is aligned to huge page size it is the + * head mapping. + */ + if (pfn_offset == 0) { + /* Populate the head page vmemmap page */ + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + /* + * Populate the tail pages vmemmap page + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + addr_pfn += 2; + next = addr + 2 * PAGE_SIZE; + continue; + } + /* + * get the 2nd mapping details + * Also create it if that doesn't exist + */ + tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node); + if (!tail_page_pte) { + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte)); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + } + return 0; +} + + #ifdef CONFIG_MEMORY_HOTPLUG void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { -- cgit v1.2.3 From 601f006fddc66e369fdac7c572f981eafd159dac Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:58 +0530 Subject: powerpc/book3s64/radix: remove mmu_vmemmap_psize This is not used by radix anymore. [aneesh.kumar@linux.ibm.com: fix kernel build error] Link: https://lkml.kernel.org/r/874jlowd0c.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-13-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/radix_pgtable.c | 11 ----------- arch/powerpc/mm/init_64.c | 21 ++++++++++++++------- 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index c264e6a3620e..955cf6b68a3a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -601,17 +601,6 @@ void __init radix__early_init_mmu(void) #else mmu_virtual_psize = MMU_PAGE_4K; #endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* vmemmap mapping */ - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } else - mmu_vmemmap_psize = mmu_virtual_psize; -#endif #endif /* * initialize page table size diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index af0c9891c38f..f7930360406e 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -198,17 +198,12 @@ bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, return false; } -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) +static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; -#ifdef CONFIG_PPC_BOOK3S_64 - if (radix_enabled()) - return radix__vmemmap_populate(start, end, node, altmap); -#endif - /* Align to the page size of the linear mapping. */ start = ALIGN_DOWN(start, page_size); @@ -277,6 +272,18 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return 0; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_populate(start, end, node, altmap); +#endif + + return __vmemmap_populate(start, end, node, altmap); +} + #ifdef CONFIG_MEMORY_HOTPLUG static unsigned long vmemmap_list_free(unsigned long start) { -- cgit v1.2.3 From 6be3601517d90b728095d70c14f3a04b9adcb166 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:59 +0530 Subject: powerpc/book3s64/radix: add debug message to give more details of vmemmap allocation Add some extra vmemmap pr_debug message that will indicate the type of vmemmap allocations. For ex: with DAX vmemmap optimization we can find the below details: [ 187.166580] radix-mmu: PAGE_SIZE vmemmap mapping [ 187.166587] radix-mmu: PAGE_SIZE vmemmap mapping [ 187.166591] radix-mmu: Tail page reuse vmemmap mapping [ 187.166594] radix-mmu: Tail page reuse vmemmap mapping [ 187.166598] radix-mmu: Tail page reuse vmemmap mapping [ 187.166601] radix-mmu: Tail page reuse vmemmap mapping [ 187.166604] radix-mmu: Tail page reuse vmemmap mapping [ 187.166608] radix-mmu: Tail page reuse vmemmap mapping [ 187.166611] radix-mmu: Tail page reuse vmemmap mapping [ 187.166614] radix-mmu: Tail page reuse vmemmap mapping [ 187.166617] radix-mmu: Tail page reuse vmemmap mapping [ 187.166620] radix-mmu: Tail page reuse vmemmap mapping [ 187.166623] radix-mmu: Tail page reuse vmemmap mapping [ 187.166626] radix-mmu: Tail page reuse vmemmap mapping [ 187.166629] radix-mmu: Tail page reuse vmemmap mapping [ 187.166632] radix-mmu: Tail page reuse vmemmap mapping And without vmemmap optimization [ 293.549931] radix-mmu: PMD_SIZE vmemmap mapping [ 293.549984] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550032] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550076] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550117] radix-mmu: PMD_SIZE vmemmap mapping Link: https://lkml.kernel.org/r/20230724190759.483013-14-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/radix_pgtable.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 955cf6b68a3a..96679018e7fb 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1033,6 +1033,7 @@ static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); if (!p) return NULL; + pr_debug("PAGE_SIZE vmemmap mapping\n"); } else { /* * When a PTE/PMD entry is freed from the init_mm @@ -1045,6 +1046,7 @@ static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long */ get_page(reuse); p = page_to_virt(reuse); + pr_debug("Tail page reuse vmemmap mapping\n"); } VM_BUG_ON(!PAGE_ALIGNED(addr)); @@ -1154,6 +1156,7 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); if (p) { vmemmap_set_pmd(pmd, p, node, addr, next); + pr_debug("PMD_SIZE vmemmap mapping\n"); continue; } else if (altmap) { /* -- cgit v1.2.3 From 60081bf19b0ec8fa40c589bd361fa2bc763f1050 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:22 -0700 Subject: mm: lock vma explicitly before doing vm_flags_reset and vm_flags_reset_once Implicit vma locking inside vm_flags_reset() and vm_flags_reset_once() is not obvious and makes it hard to understand where vma locking is happening. Also in some cases (like in dup_userfaultfd()) vma should be locked earlier than vma_flags modification. To make locking more visible, change these functions to assert that the vma write lock is taken and explicitly lock the vma beforehand. Fix userfaultfd functions which should lock the vma earlier. Link: https://lkml.kernel.org/r/20230804152724.3090321-5-surenb@google.com Suggested-by: Linus Torvalds Signed-off-by: Suren Baghdasaryan Cc: Jann Horn Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 + fs/userfaultfd.c | 6 ++++++ include/linux/mm.h | 10 +++++++--- mm/madvise.c | 5 ++--- mm/mlock.c | 3 ++- mm/mprotect.c | 1 + 6 files changed, 19 insertions(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 709ebd578394..e2d6f9327f77 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -410,6 +410,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, ret = H_STATE; break; } + vma_start_write(vma); /* Copy vm_flags to avoid partial modifications in ksm_madvise */ vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 9854d44ae18e..70bd2951b68d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -667,6 +667,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, mmap_write_lock(mm); for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); @@ -702,6 +703,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); return 0; @@ -783,6 +785,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, atomic_inc(&ctx->mmap_changing); } else { /* Drop uffd context if remap feature not enabled */ + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); } @@ -940,6 +943,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; } + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1511,6 +1515,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; @@ -1694,6 +1699,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; diff --git a/include/linux/mm.h b/include/linux/mm.h index 49eafc62b4e6..5a6ff9140090 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -774,18 +774,22 @@ static inline void vm_flags_init(struct vm_area_struct *vma, ACCESS_PRIVATE(vma, __vm_flags) = flags; } -/* Use when VMA is part of the VMA tree and modifications need coordination */ +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. + */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { - vma_start_write(vma); + vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { - vma_start_write(vma); + vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } diff --git a/mm/madvise.c b/mm/madvise.c index da65f8bd9ac3..8498f700c284 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -173,9 +173,8 @@ static int madvise_update_vma(struct vm_area_struct *vma, } success: - /* - * vm_flags is protected by the mmap_lock held in write mode. - */ + /* vm_flags is protected by the mmap_lock held in write mode. */ + vma_start_write(vma); vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); diff --git a/mm/mlock.c b/mm/mlock.c index 0a0c996c5c21..1746600a2e14 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -386,6 +386,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, */ if (newflags & VM_LOCKED) newflags |= VM_IO; + vma_start_write(vma); vm_flags_reset_once(vma, newflags); lru_add_drain(); @@ -460,9 +461,9 @@ success: * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ - if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ + vma_start_write(vma); vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); diff --git a/mm/mprotect.c b/mm/mprotect.c index 3f36c88a238e..7cd7f644da80 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -656,6 +656,7 @@ success: * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. */ + vma_start_write(vma); vm_flags_reset(vma, newflags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; -- cgit v1.2.3 From 603fd64dfa45d1e9df996911e4010f2b00731387 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:45:00 +0530 Subject: powerpc/book3s64/memhotplug: enable memmap on memory for radix Radix vmemmap mapping can map things correctly at the PMD level or PTE level based on different device boundary checks. Hence we skip the restrictions w.r.t vmemmap size to be multiple of PMD_SIZE. This also makes the feature widely useful because to use PMD_SIZE vmemmap area we require a memory block size of 2GiB We can also use MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY to that the feature can work with a memory block size of 256MB. Using altmap.reserve feature to align things correctly at pageblock granularity. We can end up losing some pages in memory with this. For ex: with a 256MiB memory block size, we require 4 pages to map vmemmap pages, In order to align things correctly we end up adding a reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved. Link: https://lkml.kernel.org/r/20230808091501.287660-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/pgtable.h | 21 +++++++++++++++++++++ arch/powerpc/platforms/pseries/hotplug-memory.c | 2 +- 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d0497d13f5b4..938294c996dc 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -157,6 +157,7 @@ config PPC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index a4893b17705a..33464e6d6431 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -161,6 +161,27 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size); bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, unsigned long page_size); +/* + * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into details + * some of the restrictions. We don't check for PMD_SIZE because our + * vmemmap allocation code can fallback correctly. The pageblock + * alignment requirement is met using altmap->reserve blocks. + */ +#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory +static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) +{ + if (!radix_enabled()) + return false; + /* + * With 4K page size and 2M PMD_SIZE, we can align + * things better with memory block size value + * starting from 128MB. Hence align things with PMD_SIZE. + */ + if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) + return IS_ALIGNED(vmemmap_size, PMD_SIZE); + return true; +} + #endif /* CONFIG_PPC64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 9c62c2c3b3d0..4f3d6a2f9065 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -637,7 +637,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) nid = first_online_node; /* Add the memory */ - rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE); + rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_MEMMAP_ON_MEMORY); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; -- cgit v1.2.3 From 4eaca96140b33e2d1fad761d1468c8293859da11 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:54 -0700 Subject: powerpc: convert various functions to use ptdescs In order to split struct ptdesc from struct page, convert various functions to use ptdescs. Link: https://lkml.kernel.org/r/20230807230513.102486-13-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/mmu_context.c | 10 +++--- arch/powerpc/mm/book3s64/pgtable.c | 32 +++++++++---------- arch/powerpc/mm/pgtable-frag.c | 58 +++++++++++++++++----------------- 3 files changed, 50 insertions(+), 50 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index c766e4c26e42..1715b07c630c 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -246,15 +246,15 @@ static void destroy_contexts(mm_context_t *ctx) static void pmd_frag_destroy(void *pmd_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pmd_frag); + ptdesc = virt_to_ptdesc(pmd_frag); /* drop all the pending references */ count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 75b938268b04..1498ccd08367 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -384,22 +384,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm) static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) { void *ret = NULL; - struct page *page; + struct ptdesc *ptdesc; gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - page = alloc_page(gfp); - if (!page) + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -409,12 +409,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return + * If we find ptdesc_page set, we return * the allocated page with single fragment * count. */ if (likely(!mm->context.pmd_frag)) { - atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR); mm->context.pmd_frag = ret + PMD_FRAG_SIZE; } spin_unlock(&mm->page_table_lock); @@ -435,15 +435,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) void pmd_fragment_free(unsigned long *pmd) { - struct page *page = virt_to_page(pmd); + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); - if (PageReserved(page)) - return free_reserved_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 0c6b68130025..8c31802f97e8 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -18,15 +18,15 @@ void pte_frag_destroy(void *pte_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pte_frag); + ptdesc = virt_to_ptdesc(pte_frag); /* drop all the pending references */ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pte_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } } @@ -55,25 +55,25 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm) static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) { void *ret = NULL; - struct page *page; + struct ptdesc *ptdesc; if (!kernel) { - page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); - if (!page) + ptdesc = pagetable_alloc(PGALLOC_GFP | __GFP_ACCOUNT, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } } else { - page = alloc_page(PGALLOC_GFP); - if (!page) + ptdesc = pagetable_alloc(PGALLOC_GFP, 0); + if (!ptdesc) return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -82,12 +82,12 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) return ret; spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return + * If we find ptdesc_page set, we return * the allocated page with single fragment * count. */ if (likely(!pte_frag_get(&mm->context))) { - atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR); pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE); } spin_unlock(&mm->page_table_lock); @@ -108,28 +108,28 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) static void pte_free_now(struct rcu_head *head) { - struct page *page; + struct ptdesc *ptdesc; - page = container_of(head, struct page, rcu_head); - pgtable_pte_page_dtor(page); - __free_page(page); + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } void pte_fragment_free(unsigned long *table, int kernel) { - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); - if (PageReserved(page)) - return free_reserved_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { if (kernel) - __free_page(page); - else if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + pagetable_free(ptdesc); + else if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } } -- cgit v1.2.3 From 4089eef0e6ac1a179c58304c657b3df3bb6fe509 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:54 -0700 Subject: mm: drop per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED handle_mm_fault returning VM_FAULT_RETRY or VM_FAULT_COMPLETED means mmap_lock has been released. However with per-VMA locks behavior is different and the caller should still release it. To make the rules consistent for the caller, drop the per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED. Currently the only path returning VM_FAULT_RETRY under per-VMA locks is do_swap_page and no path returns VM_FAULT_COMPLETED for now. [willy@infradead.org: fix riscv] Link: https://lkml.kernel.org/r/CAJuCfpE6GWEx1rPBmNpUfoD5o-gNFz9-UFywzCE2PbEGBiVz7g@mail.gmail.com Link: https://lkml.kernel.org/r/20230630211957.1341547-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Peter Xu Tested-by: Conor Dooley Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 3 ++- arch/powerpc/mm/fault.c | 3 ++- arch/riscv/mm/fault.c | 3 ++- arch/s390/mm/fault.c | 3 ++- arch/x86/mm/fault.c | 3 ++- mm/memory.c | 12 ++++++++++++ 6 files changed, 22 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 103fcbdc6552..2e5d1e238af9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -599,7 +599,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto lock_mmap; } fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index fafce6bdeff0..b1723094d464 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -488,7 +488,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 046732fcb48c..6115d7514972 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -296,7 +296,8 @@ void handle_page_fault(struct pt_regs *regs) } fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6f6b9881e55e..a063774ba584 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -417,7 +417,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); if (likely(!(fault & VM_FAULT_ERROR))) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 787da09d24f3..2e861b9360c7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1340,7 +1340,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/mm/memory.c b/mm/memory.c index f9c3ad489823..b9c3780fd426 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3747,6 +3747,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_VMA_LOCK) { ret = VM_FAULT_RETRY; + vma_end_read(vma); goto out; } @@ -5248,6 +5249,17 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, !is_cow_mapping(vma->vm_flags))) return VM_FAULT_SIGSEGV; } +#ifdef CONFIG_PER_VMA_LOCK + /* + * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of + * the assumption that lock is dropped on VM_FAULT_RETRY. + */ + if (WARN_ON_ONCE((*flags & + (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == + (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) + return VM_FAULT_SIGSEGV; +#endif + return 0; } -- cgit v1.2.3 From 9fee28baa601f4dbf869b1373183b312d2d5ef3d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:49 +0100 Subject: powerpc: implement the new page table range API Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. [willy@infradead.org: re-export flush_dcache_icache_folio()] Link: https://lkml.kernel.org/r/ZMx1daYwvD9EM7Cv@casper.infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/32/pgtable.h | 5 --- arch/powerpc/include/asm/book3s/64/pgtable.h | 6 +--- arch/powerpc/include/asm/book3s/pgtable.h | 11 ++---- arch/powerpc/include/asm/cacheflush.h | 14 +++++--- arch/powerpc/include/asm/kvm_ppc.h | 10 +++--- arch/powerpc/include/asm/nohash/pgtable.h | 16 +++------ arch/powerpc/include/asm/pgtable.h | 12 +++++++ arch/powerpc/mm/book3s64/hash_utils.c | 11 +++--- arch/powerpc/mm/cacheflush.c | 41 ++++++++------------- arch/powerpc/mm/nohash/e500_hugetlbpage.c | 3 +- arch/powerpc/mm/pgtable.c | 53 ++++++++++++++++------------ 11 files changed, 89 insertions(+), 93 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 7bf1fe7297c6..5f12b9382909 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -462,11 +462,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) pgprot_val(pgprot)); } -static inline unsigned long pte_pfn(pte_t pte) -{ - return pte_val(pte) >> PTE_RPN_SHIFT; -} - /* Generic modifiers for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index a8204566cfd0..8269b231c533 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -104,6 +104,7 @@ * and every thing below PAGE_SHIFT; */ #define PTE_RPN_MASK (((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK)) +#define PTE_RPN_SHIFT PAGE_SHIFT /* * set of bits not changed in pmd_modify. Even though we have hash specific bits * in here, on radix we expect them to be zero. @@ -569,11 +570,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot) | _PAGE_PTE); } -static inline unsigned long pte_pfn(pte_t pte) -{ - return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT; -} - /* Generic modifiers for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h index d18b748ea3ae..3b7bd36a2321 100644 --- a/arch/powerpc/include/asm/book3s/pgtable.h +++ b/arch/powerpc/include/asm/book3s/pgtable.h @@ -9,13 +9,6 @@ #endif #ifndef __ASSEMBLY__ -/* Insert a PTE, top-level function is out of line. It uses an inline - * low level function in the respective pgtable-* files - */ -extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte); - - #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); @@ -36,7 +29,9 @@ void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * corresponding HPTE into the hash table ahead of time, instead of * waiting for the inevitable extra hash-table miss exception. */ -static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { if (IS_ENABLED(CONFIG_PPC32) && !mmu_has_feature(MMU_FTR_HPTE_TABLE)) return; diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index 7564dd4fd12b..ef7d2de33b89 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -35,13 +35,19 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) * It just marks the page as not i-cache clean. We do the i-cache * flush later when the page is given to a user process, if necessary. */ -static inline void flush_dcache_page(struct page *page) +static inline void flush_dcache_folio(struct folio *folio) { if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) return; /* avoid an atomic op if possible */ - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); +} +#define flush_dcache_folio flush_dcache_folio + +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); } void flush_icache_range(unsigned long start, unsigned long stop); @@ -51,7 +57,7 @@ void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); #define flush_icache_user_page flush_icache_user_page -void flush_dcache_icache_page(struct page *page); +void flush_dcache_icache_folio(struct folio *folio); /** * flush_dcache_range(): Write any modified data cache blocks out to memory and diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d16d80ad2ae4..b4da8514af43 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -894,7 +894,7 @@ void kvmppc_init_lpid(unsigned long nr_lpids); static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn) { - struct page *page; + struct folio *folio; /* * We can only access pages that the kernel maps * as memory. Bail out for unmapped ones. @@ -903,10 +903,10 @@ static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn) return; /* Clear i-cache for new pages */ - page = pfn_to_page(pfn); - if (!test_bit(PG_dcache_clean, &page->flags)) { - flush_dcache_icache_page(page); - set_bit(PG_dcache_clean, &page->flags); + folio = page_folio(pfn_to_page(pfn)); + if (!test_bit(PG_dcache_clean, &folio->flags)) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } } diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index a6caaaab6f92..56ea48276356 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -101,8 +101,6 @@ static inline bool pte_access_permitted(pte_t pte, bool write) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) { return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) | pgprot_val(pgprot)); } -static inline unsigned long pte_pfn(pte_t pte) { - return pte_val(pte) >> PTE_RPN_SHIFT; } /* Generic modifiers for PTE bits */ static inline pte_t pte_exprotect(pte_t pte) @@ -166,12 +164,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); } -/* Insert a PTE, top-level function is out of line. It uses an inline - * low level function in the respective pgtable-* files - */ -extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte); - /* This low level function performs the actual PTE insertion * Setting the PTE depends on the MMU type and other factors. It's * an horrible mess that I'm not going to try to clean up now but @@ -282,10 +274,12 @@ static inline int pud_huge(pud_t pud) * for the page which has just been mapped in. */ #if defined(CONFIG_PPC_E500) && defined(CONFIG_HUGETLB_PAGE) -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); #else -static inline -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) {} +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) {} #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 33464e6d6431..b2e9bc4a52c1 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -41,6 +41,12 @@ struct mm_struct; #ifndef __ASSEMBLY__ +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr); +#define set_ptes set_ptes +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + #ifndef MAX_PTRS_PER_PGD #define MAX_PTRS_PER_PGD PTRS_PER_PGD #endif @@ -48,6 +54,12 @@ struct mm_struct; /* Keep these as a macros to avoid include dependency mess */ #define pte_page(x) pfn_to_page(pte_pfn(x)) #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) + +static inline unsigned long pte_pfn(pte_t pte) +{ + return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT; +} + /* * Select all bits except the pfn */ diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index fedffe3ae136..ad2afa08e62e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1307,18 +1307,19 @@ void hash__early_init_mmu_secondary(void) */ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) { - struct page *page; + struct folio *folio; if (!pfn_valid(pte_pfn(pte))) return pp; - page = pte_page(pte); + folio = page_folio(pte_page(pte)); /* page is dirty */ - if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) { + if (!test_bit(PG_dcache_clean, &folio->flags) && + !folio_test_reserved(folio)) { if (trap == INTERRUPT_INST_STORAGE) { - flush_dcache_icache_page(page); - set_bit(PG_dcache_clean, &page->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } else pp |= HPTE_R_N; } diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c index 0e9b4879c0f9..15189592da09 100644 --- a/arch/powerpc/mm/cacheflush.c +++ b/arch/powerpc/mm/cacheflush.c @@ -148,44 +148,31 @@ static void __flush_dcache_icache(void *p) invalidate_icache_range(addr, addr + PAGE_SIZE); } -static void flush_dcache_icache_hugepage(struct page *page) +void flush_dcache_icache_folio(struct folio *folio) { - int i; - int nr = compound_nr(page); + unsigned int i, nr = folio_nr_pages(folio); - if (!PageHighMem(page)) { + if (flush_coherent_icache()) + return; + + if (!folio_test_highmem(folio)) { + void *addr = folio_address(folio); for (i = 0; i < nr; i++) - __flush_dcache_icache(lowmem_page_address(page + i)); - } else { + __flush_dcache_icache(addr + i * PAGE_SIZE); + } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { for (i = 0; i < nr; i++) { - void *start = kmap_local_page(page + i); + void *start = kmap_local_folio(folio, i * PAGE_SIZE); __flush_dcache_icache(start); kunmap_local(start); } - } -} - -void flush_dcache_icache_page(struct page *page) -{ - if (flush_coherent_icache()) - return; - - if (PageCompound(page)) - return flush_dcache_icache_hugepage(page); - - if (!PageHighMem(page)) { - __flush_dcache_icache(lowmem_page_address(page)); - } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { - void *start = kmap_local_page(page); - - __flush_dcache_icache(start); - kunmap_local(start); } else { - flush_dcache_icache_phys(page_to_phys(page)); + unsigned long pfn = folio_pfn(folio); + for (i = 0; i < nr; i++) + flush_dcache_icache_phys((pfn + i) * PAGE_SIZE); } } -EXPORT_SYMBOL(flush_dcache_icache_page); +EXPORT_SYMBOL(flush_dcache_icache_folio); void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c index 58c8d9849cb1..6b30e40d4590 100644 --- a/arch/powerpc/mm/nohash/e500_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c @@ -178,7 +178,8 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) { if (is_vm_hugetlb_page(vma)) book3e_hugetlb_preload(vma, address, *ptep); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index a3dcdb2d5b4b..3f86fd217690 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -58,7 +58,7 @@ static inline int pte_looks_normal(pte_t pte) return 0; } -static struct page *maybe_pte_to_page(pte_t pte) +static struct folio *maybe_pte_to_folio(pte_t pte) { unsigned long pfn = pte_pfn(pte); struct page *page; @@ -68,7 +68,7 @@ static struct page *maybe_pte_to_page(pte_t pte) page = pfn_to_page(pfn); if (PageReserved(page)) return NULL; - return page; + return page_folio(page); } #ifdef CONFIG_PPC_BOOK3S @@ -84,12 +84,12 @@ static pte_t set_pte_filter_hash(pte_t pte) pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || cpu_has_feature(CPU_FTR_NOEXECUTE))) { - struct page *pg = maybe_pte_to_page(pte); - if (!pg) + struct folio *folio = maybe_pte_to_folio(pte); + if (!folio) return pte; - if (!test_bit(PG_dcache_clean, &pg->flags)) { - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + if (!test_bit(PG_dcache_clean, &folio->flags)) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } } return pte; @@ -107,7 +107,7 @@ static pte_t set_pte_filter_hash(pte_t pte) { return pte; } */ static inline pte_t set_pte_filter(pte_t pte) { - struct page *pg; + struct folio *folio; if (radix_enabled()) return pte; @@ -120,18 +120,18 @@ static inline pte_t set_pte_filter(pte_t pte) return pte; /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) return pte; /* If the page clean, we move on */ - if (test_bit(PG_dcache_clean, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags)) return pte; /* If it's an exec fault, we flush the cache and make it clean */ if (is_exec_fault()) { - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); return pte; } @@ -142,7 +142,7 @@ static inline pte_t set_pte_filter(pte_t pte) static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, int dirty) { - struct page *pg; + struct folio *folio; if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) return pte; @@ -168,17 +168,17 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, #endif /* CONFIG_DEBUG_VM */ /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) goto bail; /* If the page is already clean, we move on */ - if (test_bit(PG_dcache_clean, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags)) goto bail; /* Clean the page and set PG_dcache_clean */ - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); bail: return pte_mkexec(pte); @@ -187,8 +187,8 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, /* * set_pte stores a linux PTE into the linux page table. */ -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte) +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { /* * Make sure hardware valid bit is not set. We don't do @@ -203,7 +203,16 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte = set_pte_filter(pte); /* Perform the setting of the PTE */ - __set_pte_at(mm, addr, ptep, pte, 0); + arch_enter_lazy_mmu_mode(); + for (;;) { + __set_pte_at(mm, addr, ptep, pte, 0); + if (--nr == 0) + break; + ptep++; + pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT)); + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); } void unmap_kernel_page(unsigned long va) -- cgit v1.2.3