From 8211dad6279817a8966ff6b74c2c588dd4166f45 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:38:35 -0700 Subject: s390: add pte_free_defer() for pgtables sharing page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add s390-specific pte_free_defer(), to free table page via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This precedes the generic version to avoid build breakage from incompatible pgtable_t. This version is more complicated than others: because s390 fits two 2K page tables into one 4K page (so page->rcu_head must be shared between both halves), and already uses page->lru (which page->rcu_head overlays) to list any free halves; with clever management by page->_refcount bits. Build upon the existing management, adjusted to follow a new rule: that a page is never on the free list if pte_free_defer() was used on either half (marked by PageActive). And for simplicity, delay calling RCU until both halves are freed. Not adding back unallocated fragments to the list in pte_free_defer() can result in wasting some amount of memory for pagetables, depending on how long the allocated fragment will stay in use. In practice, this effect is expected to be insignificant, and not justify a far more complex approach, which might allow to add the fragments back later in __tlb_remove_table(), where we might not have a stable mm any more. [hughd@google.com: Claudio finds warning on mm_has_pgste() more useful than on mm_alloc_pgste()] Link: https://lkml.kernel.org/r/3bc095ba-a180-ce3b-82b1-2bfc64612f3@google.com Link: https://lkml.kernel.org/r/94eccf5f-264c-8abe-4567-e77f4b4e14a@google.com Signed-off-by: Hugh Dickins Reviewed-by: Gerald Schaefer Tested-by: Alexander Gordeev Acked-by: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/s390/mm/pgalloc.c | 80 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 12 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 66ab68db9842..d7374add7820 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -229,6 +229,15 @@ void page_table_free_pgste(struct page *page) * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable * while the PP bits are never used, nor such a page is added to or removed * from mm_context_t::pgtable_list. + * + * pte_free_defer() overrides those rules: it takes the page off pgtable_list, + * and prevents both 2K fragments from being reused. 
pte_free_defer() has to + * guarantee that its pgtable cannot be reused before the RCU grace period + * has elapsed (which page_table_free_rcu() does not actually guarantee). + * But for simplicity, because page->rcu_head overlays page->lru, and because + * the RCU callback might not be called before the mm_context_t has been freed, + * pte_free_defer() in this implementation prevents both fragments from being + * reused, and delays making the call to RCU until both fragments are freed. */ unsigned long *page_table_alloc(struct mm_struct *mm) { @@ -261,7 +270,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table += PTRS_PER_PTE; atomic_xor_bits(&page->_refcount, 0x01U << (bit + 24)); - list_del(&page->lru); + list_del_init(&page->lru); } } spin_unlock_bh(&mm->context.lock); @@ -281,6 +290,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = (unsigned long *) page_to_virt(page); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ + INIT_LIST_HEAD(&page->lru); atomic_xor_bits(&page->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); @@ -300,7 +310,9 @@ static void page_table_release_check(struct page *page, void *table, { char msg[128]; - if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + if (!mask && list_empty(&page->lru)) return; snprintf(msg, sizeof(msg), "Invalid pgtable %p release half 0x%02x mask 0x%02x", @@ -308,6 +320,15 @@ static void page_table_release_check(struct page *page, void *table, dump_page(page, msg); } +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + pgtable_pte_page_dtor(page); + __free_page(page); +} + void page_table_free(struct mm_struct *mm, unsigned long *table) { unsigned int mask, bit, half; @@ -325,10 +346,17 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) */ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 0x03U) + if ((mask & 0x03U) && !PageActive(page)) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to head of list, to make + * this freed half available for immediate reuse. + */ list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); + } else { + /* If page is on list, now remove it. */ + list_del_init(&page->lru); + } spin_unlock_bh(&mm->context.lock); mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); mask >>= 24; @@ -342,8 +370,10 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) } page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, @@ -370,10 +400,18 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, */ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 0x03U) + if ((mask & 0x03U) && !PageActive(page)) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to end of list, to make + * this freed half available for reuse once its pending + * bit has been cleared by __tlb_remove_table(). + */ list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); + } else { + /* If page is on list, now remove it. 
*/ + list_del_init(&page->lru); + } spin_unlock_bh(&mm->context.lock); table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); tlb_remove_table(tlb, table); @@ -403,9 +441,27 @@ void __tlb_remove_table(void *_table) } page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + SetPageActive(page); + page_table_free(mm, (unsigned long *)pgtable); + /* + * page_table_free() does not do the pgste gmap_unlink() which + * page_table_free_rcu() does: warn us if pgste ever reaches here. + */ + WARN_ON_ONCE(mm_has_pgste(mm)); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Base infrastructure required to generate basic asces, region, segment, -- cgit v1.2.3 From 284e05920498788c5df1a7dd6424adb426498e1c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:01 +0100 Subject: mm: remove CONFIG_PER_VMA_LOCK ifdefs Patch series "Handle most file-backed faults under the VMA lock", v3. This patchset adds the ability to handle page faults on parts of files which are already in the page cache without taking the mmap lock. This patch (of 10): Provide lock_vma_under_rcu() when CONFIG_PER_VMA_LOCK is not defined to eliminate ifdefs in the users. Link: https://lkml.kernel.org/r/20230724185410.1124082-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230724185410.1124082-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Punit Agrawal Cc: Arjun Roy Cc: Eric Dumazet Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 2 -- arch/powerpc/mm/fault.c | 4 ---- arch/riscv/mm/fault.c | 4 ---- arch/s390/mm/fault.c | 2 -- arch/x86/mm/fault.c | 4 ---- include/linux/mm.h | 6 ++++++ 6 files changed, 6 insertions(+), 16 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3fe516b32577..103fcbdc6552 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -587,7 +587,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); -#ifdef CONFIG_PER_VMA_LOCK if (!(mm_flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -615,7 +614,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, return 0; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, addr, regs); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5bfdf6ecfa96..fafce6bdeff0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -469,7 +469,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, if (is_exec) flags |= FAULT_FLAG_INSTRUCTION; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -501,7 +500,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, return user_mode(regs) ? 0 : SIGBUS; lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ /* When running in the kernel we expect faults to occur only to * addresses in user space. 
All other faults represent errors in the @@ -551,9 +549,7 @@ retry: mmap_read_unlock(current->mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 6ea2cce4cc17..046732fcb48c 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -283,7 +283,6 @@ void handle_page_fault(struct pt_regs *regs) flags |= FAULT_FLAG_WRITE; else if (cause == EXC_INST_PAGE_FAULT) flags |= FAULT_FLAG_INSTRUCTION; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -311,7 +310,6 @@ void handle_page_fault(struct pt_regs *regs) return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, addr, regs); @@ -368,9 +366,7 @@ retry: mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (unlikely(fault & VM_FAULT_ERROR)) { tsk->thread.bad_cause = cause; mm_fault_error(regs, addr, fault); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2f123429a291..6f6b9881e55e 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -407,7 +407,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) access = VM_WRITE; if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); @@ -432,7 +431,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ mmap_read_lock(mm); gmap = NULL; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e8711b2cafaf..787da09d24f3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1328,7 +1328,6 @@ void do_user_addr_fault(struct pt_regs *regs, } #endif -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -1358,7 +1357,6 @@ void do_user_addr_fault(struct pt_regs *regs, return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, address, regs); @@ -1418,9 +1416,7 @@ retry: } mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (likely(!(fault & VM_FAULT_ERROR))) return; diff --git a/include/linux/mm.h b/include/linux/mm.h index ded514ee2588..21299a0cfbca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -742,6 +742,12 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} +static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + return NULL; +} + #endif /* CONFIG_PER_VMA_LOCK */ /* -- cgit v1.2.3 From 6326c26c1514757242829b292b26eac589013200 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:56 -0700 Subject: s390: convert various pgalloc functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-15-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgalloc.h | 4 +- arch/s390/include/asm/tlb.h | 4 +- arch/s390/mm/pgalloc.c | 128 ++++++++++++++++++++-------------------- 3 files changed, 69 insertions(+), 67 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 89a9d5ef94f8..376b4b23bdaa 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -86,7 +86,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) if (!table) return NULL; crst_table_init(table, _SEGMENT_ENTRY_EMPTY); - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { + if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { crst_table_free(mm, table); return NULL; } @@ -97,7 +97,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { if (mm_pmd_folded(mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); crst_table_free(mm, (unsigned long *) pmd); } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index b91f4a9b044c..383b1f91442c 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -89,12 +89,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, { if (mm_pmd_folded(tlb->mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_puds = 1; - tlb_remove_table(tlb, pmd); + tlb_remove_ptdesc(tlb, pmd); } /* diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d7374add7820..07fc660a24aa 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -43,17 +43,17 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); - if (!page) + if (!ptdesc) return NULL; - arch_set_page_dat(page, CRST_ALLOC_ORDER); - return (unsigned long *) page_to_virt(page); + arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER); + return (unsigned long *) ptdesc_to_virt(ptdesc); } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) @@ -140,21 +140,21 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) struct page *page_table_alloc_pgste(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_virt(page); + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (ptdesc) { + table = (u64 *)ptdesc_to_virt(ptdesc); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc_page(ptdesc); } void page_table_free_pgste(struct page *page) { - __free_page(page); + pagetable_free(page_ptdesc(page)); } #endif /* CONFIG_PGSTE */ @@ -242,7 +242,7 @@ void page_table_free_pgste(struct page *page) unsigned long *page_table_alloc(struct mm_struct *mm) { unsigned long 
*table; - struct page *page; + struct ptdesc *ptdesc; unsigned int mask, bit; /* Try to get a fragment of a 4K page as a 2K page table */ @@ -250,9 +250,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = NULL; spin_lock_bh(&mm->context.lock); if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; + ptdesc = list_first_entry(&mm->context.pgtable_list, + struct ptdesc, pt_list); + mask = atomic_read(&ptdesc->_refcount) >> 24; /* * The pending removal bits must also be checked. * Failure to do so might lead to an impossible @@ -264,13 +264,13 @@ unsigned long *page_table_alloc(struct mm_struct *mm) */ mask = (mask | (mask >> 4)) & 0x03U; if (mask != 0x03U) { - table = (unsigned long *) page_to_virt(page); + table = (unsigned long *) ptdesc_to_virt(ptdesc); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, + atomic_xor_bits(&ptdesc->_refcount, 0x01U << (bit + 24)); - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } } spin_unlock_bh(&mm->context.lock); @@ -278,28 +278,28 @@ unsigned long *page_table_alloc(struct mm_struct *mm) return table; } /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); + arch_set_page_dat(ptdesc_page(ptdesc), 0); /* Initialize page table */ - table = (unsigned long *) page_to_virt(page); + table = (unsigned long *) ptdesc_to_virt(ptdesc); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ - INIT_LIST_HEAD(&page->lru); - atomic_xor_bits(&page->_refcount, 0x03U << 24); + INIT_LIST_HEAD(&ptdesc->pt_list); + atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 0x01U << 24); + atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24); memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); spin_unlock_bh(&mm->context.lock); } return table; @@ -322,19 +322,18 @@ static void page_table_release_check(struct page *page, void *table, static void pte_free_now(struct rcu_head *head) { - struct page *page; + struct ptdesc *ptdesc; - page = container_of(head, struct page, rcu_head); - pgtable_pte_page_dtor(page); - __free_page(page); + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } void page_table_free(struct mm_struct *mm, unsigned long *table) { unsigned int mask, bit, half; - struct page *page; + struct ptdesc *ptdesc = virt_to_ptdesc(table); - page = virt_to_page(table); if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); @@ -344,51 +343,50 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) * will happen outside of the critical section from this * function or from __tlb_remove_table() */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 
24)); mask >>= 24; - if ((mask & 0x03U) && !PageActive(page)) { + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { /* * Other half is allocated, and neither half has had * its free deferred: add page to head of list, to make * this freed half available for immediate reuse. */ - list_add(&page->lru, &mm->context.pgtable_list); + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); } else { /* If page is on list, now remove it. */ - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } spin_unlock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24)); mask >>= 24; if (mask != 0x00U) return; half = 0x01U << bit; } else { half = 0x03U; - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); mask >>= 24; } - page_table_release_check(page, table, half, mask); - if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, unsigned long vmaddr) { struct mm_struct *mm; - struct page *page; unsigned int bit, mask; + struct ptdesc *ptdesc = virt_to_ptdesc(table); mm = tlb->mm; - page = virt_to_page(table); if (mm_alloc_pgste(mm)) { gmap_unlink(mm, table, vmaddr); table = (unsigned long *) ((unsigned long)table | 0x03U); - tlb_remove_table(tlb, table); + tlb_remove_ptdesc(tlb, table); return; } bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); @@ -398,19 +396,19 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, * outside of the critical section from __tlb_remove_table() or from * page_table_free() */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if ((mask & 0x03U) && !PageActive(page)) { + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { /* * Other half is allocated, and neither half has had * its free deferred: add page to end of list, to make * this freed half available for reuse once its pending * bit has been cleared by __tlb_remove_table(). */ - list_add_tail(&page->lru, &mm->context.pgtable_list); + list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list); } else { /* If page is on list, now remove it. 
*/ - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } spin_unlock_bh(&mm->context.lock); table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); @@ -421,30 +419,30 @@ void __tlb_remove_table(void *_table) { unsigned int mask = (unsigned long) _table & 0x03U, half = mask; void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); switch (half) { case 0x00U: /* pmd, pud, or p4d */ - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(ptdesc); return; case 0x01U: /* lower 2K of a 4K page table */ case 0x02U: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24)); mask >>= 24; if (mask != 0x00U) return; break; case 0x03U: /* 4K page table with pgstes */ - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); mask >>= 24; break; } - page_table_release_check(page, table, half, mask); - if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -488,16 +486,20 @@ static void base_pgt_free(unsigned long *table) static unsigned long *base_crst_alloc(unsigned long val) { unsigned long *table; + struct ptdesc *ptdesc; - table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + + crst_table_init(table, val); return table; } static void base_crst_free(unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ -- cgit v1.2.3 From 4089eef0e6ac1a179c58304c657b3df3bb6fe509 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:54 -0700 Subject: mm: drop per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED handle_mm_fault returning VM_FAULT_RETRY or VM_FAULT_COMPLETED means mmap_lock has been released. However with per-VMA locks behavior is different and the caller should still release it. To make the rules consistent for the caller, drop the per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED. Currently the only path returning VM_FAULT_RETRY under per-VMA locks is do_swap_page and no path returns VM_FAULT_COMPLETED for now. [willy@infradead.org: fix riscv] Link: https://lkml.kernel.org/r/CAJuCfpE6GWEx1rPBmNpUfoD5o-gNFz9-UFywzCE2PbEGBiVz7g@mail.gmail.com Link: https://lkml.kernel.org/r/20230630211957.1341547-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Peter Xu Tested-by: Conor Dooley Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 3 ++- arch/powerpc/mm/fault.c | 3 ++- arch/riscv/mm/fault.c | 3 ++- arch/s390/mm/fault.c | 3 ++- arch/x86/mm/fault.c | 3 ++- mm/memory.c | 12 ++++++++++++ 6 files changed, 22 insertions(+), 5 deletions(-) (limited to 'arch/s390/mm') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 103fcbdc6552..2e5d1e238af9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -599,7 +599,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto lock_mmap; } fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index fafce6bdeff0..b1723094d464 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -488,7 +488,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 046732fcb48c..6115d7514972 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -296,7 +296,8 @@ void handle_page_fault(struct pt_regs *regs) } fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6f6b9881e55e..a063774ba584 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -417,7 +417,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); if (likely(!(fault & VM_FAULT_ERROR))) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 787da09d24f3..2e861b9360c7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1340,7 +1340,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/mm/memory.c b/mm/memory.c index f9c3ad489823..b9c3780fd426 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3747,6 +3747,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_VMA_LOCK) { ret = VM_FAULT_RETRY; + vma_end_read(vma); goto out; } @@ -5248,6 +5249,17 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, !is_cow_mapping(vma->vm_flags))) return VM_FAULT_SIGSEGV; } +#ifdef CONFIG_PER_VMA_LOCK + /* + * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of + * the assumption that lock is dropped on 
VM_FAULT_RETRY. + */ + if (WARN_ON_ONCE((*flags & + (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == + (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) + return VM_FAULT_SIGSEGV; +#endif + return 0; } -- cgit v1.2.3
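
Editor's note: taken together, the lock_vma_under_rcu() stub added when CONFIG_PER_VMA_LOCK is off and the new rule that handle_mm_fault() has already dropped the per-VMA lock on VM_FAULT_RETRY / VM_FAULT_COMPLETED let every architecture share the same unconditional fast-path shape. The sketch below condenses that caller pattern from the arm64/powerpc/riscv/s390/x86 hunks above; it is illustrative only, not code from these patches — do_arch_page_fault() is a hypothetical name, and the per-architecture access check and mmap_lock slow path are elided.

#include <linux/mm.h>

/*
 * Condensed illustration of the per-VMA-lock fast path the patches above
 * converge on.  Not a buildable arch fault handler: the function name is
 * made up and the slow path is only hinted at.
 */
static vm_fault_t do_arch_page_fault(struct mm_struct *mm, unsigned long addr,
				     unsigned int flags, struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	if (!(flags & FAULT_FLAG_USER))
		goto lock_mmap;

	/* Stubbed to return NULL without CONFIG_PER_VMA_LOCK, so no #ifdef. */
	vma = lock_vma_under_rcu(mm, addr);
	if (!vma)
		goto lock_mmap;

	/* (per-architecture access check against vma->vm_flags elided) */

	fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
	/*
	 * New convention: on VM_FAULT_RETRY or VM_FAULT_COMPLETED the core
	 * fault code has already dropped the per-VMA lock, so only release
	 * it here in the other cases.
	 */
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		return fault;
	}

lock_mmap:
	/*
	 * Fall back to the conventional mmap_lock path
	 * (lock_mm_and_find_vma() and friends), as in the hunks above.
	 */
	return VM_FAULT_RETRY;
}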