From ebcbc6ea7d8a604ad8504dae70a6ac1b1e64a0b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:20:24 -0800 Subject: mm/munlock: delete page_mlock() and all its works We have recommended some applications to mlock their userspace, but that turns out to be counter-productive: when many processes mlock the same file, contention on rmap's i_mmap_rwsem can become intolerable at exit: it is needed for write, to remove any vma mapping that file from rmap's tree; but hogged for read by those with mlocks calling page_mlock() (formerly known as try_to_munlock()) on *each* page mapped from the file (the purpose being to find out whether another process has the page mlocked, so therefore it should not be unmlocked yet). Several optimizations have been made in the past: one is to skip page_mlock() when mapcount tells that nothing else has this page mapped; but that doesn't help at all when others do have it mapped. This time around, I initially intended to add a preliminary search of the rmap tree for overlapping VM_LOCKED ranges; but that gets messy with locking order, when in doubt whether a page is actually present; and risks adding even more contention on the i_mmap_rwsem. A solution would be much easier, if only there were space in struct page for an mlock_count... but actually, most of the time, there is space for it - an mlocked page spends most of its life on an unevictable LRU, but since 3.18 removed the scan_unevictable_pages sysctl, that "LRU" has been redundant. Let's try to reuse its page->lru. But leave that until a later patch: in this patch, clear the ground by removing page_mlock(), and all the infrastructure that has gathered around it - which mostly hinders understanding, and will make reviewing new additions harder. Don't mind those old comments about THPs, they date from before 4.5's refcounting rework: splitting is not a risk here. Just keep a minimal version of munlock_vma_page(), as reminder of what it should attend to (in particular, the odd way PGSTRANDED is counted out of PGMUNLOCKED), and likewise a stub for munlock_vma_pages_range(). Move unchanged __mlock_posix_error_return() out of the way, down to above its caller: this series then makes no further change after mlock_fixup(). After this and each following commit, the kernel builds, boots and runs; but with deficiencies which may show up in testing of mlock and munlock. The system calls succeed or fail as before, and mlock remains effective in preventing page reclaim; but meminfo's Unevictable and Mlocked amounts may be shown too low after mlock, grow, then stay too high after munlock: with previously mlocked pages remaining unevictable for too long, until finally unmapped and freed and counts corrected. Normal service will be resumed in "mm/munlock: mlock_pte_range() when mlocking or munlocking". Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index d80300392a19..e48c486d5ddf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -409,7 +409,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) * must be called with vma's mmap_lock held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); -extern unsigned int munlock_vma_page(struct page *page); +extern void munlock_vma_page(struct page *page); extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); -- cgit v1.2.3 From a213e5cf71cbcea4b23caedcb8fe6629a333b275 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:23:29 -0800 Subject: mm/munlock: delete munlock_vma_pages_all(), allow oomreap munlock_vma_pages_range() will still be required, when munlocking but not munmapping a set of pages; but when unmapping a pte, the mlock count will be maintained in much the same way as it will be maintained when mapping in the pte. Which removes the need for munlock_vma_pages_all() on mlocked vmas when munmapping or exiting: eliminating the catastrophic contention on i_mmap_rwsem, and the need for page lock on the pages. There is still a need to update locked_vm accounting according to the munmapped vmas when munmapping: do that in detach_vmas_to_be_unmapped(). exit_mmap() does not need locked_vm updates, so delete unlock_range(). And wasn't I the one who forbade the OOM reaper to attack mlocked vmas, because of the uncertainty in blocking on all those page locks? No fear of that now, so permit the OOM reaper on mlocked vmas. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/internal.h | 16 ++-------------- mm/madvise.c | 5 +++++ mm/mlock.c | 4 ++-- mm/mmap.c | 32 ++------------------------------ mm/oom_kill.c | 2 +- 5 files changed, 12 insertions(+), 47 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index e48c486d5ddf..f235aa92e564 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -71,11 +71,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); -static inline bool can_madv_lru_vma(struct vm_area_struct *vma) -{ - return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); -} - struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -398,12 +393,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, extern long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked); -extern void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); -static inline void munlock_vma_pages_all(struct vm_area_struct *vma) -{ - munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); -} +extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len); /* * must be called with vma's mmap_lock held for read or write, and page locked. @@ -411,9 +402,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) extern void mlock_vma_page(struct page *page); extern void munlock_vma_page(struct page *page); -extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, - unsigned long len); - /* * Clear the page's PageMlocked(). This can be useful in a situation where * we want to unconditionally remove a page from the pagecache -- e.g., diff --git a/mm/madvise.c b/mm/madvise.c index 5604064df464..ae35d72627ef 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -530,6 +530,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } +static inline bool can_madv_lru_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); +} + static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) diff --git a/mm/mlock.c b/mm/mlock.c index aec4ce7919da..5d7ced8303be 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -137,8 +137,8 @@ void munlock_vma_page(struct page *page) * Returns with VM_LOCKED cleared. Callers must be prepared to * deal with this. */ -void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { vma->vm_flags &= VM_LOCKED_CLEAR_MASK; diff --git a/mm/mmap.c b/mm/mmap.c index 1e8fdb0b51ed..64b5985b5295 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2674,6 +2674,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_prev = NULL; do { vma_rb_erase(vma, &mm->mm_rb); + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm -= vma_pages(vma); mm->map_count--; tail_vma = vma; vma = vma->vm_next; @@ -2778,22 +2780,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -static inline void -unlock_range(struct vm_area_struct *start, unsigned long limit) -{ - struct mm_struct *mm = start->vm_mm; - struct vm_area_struct *tmp = start; - - while (tmp && tmp->vm_start < limit) { - if (tmp->vm_flags & VM_LOCKED) { - mm->locked_vm -= vma_pages(tmp); - munlock_vma_pages_all(tmp); - } - - tmp = tmp->vm_next; - } -} - /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. @@ -2874,12 +2860,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return error; } - /* - * unlock any mlock()ed ranges before detaching vmas - */ - if (mm->locked_vm) - unlock_range(vma, end); - /* Detach vmas from rbtree */ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) downgrade = false; @@ -3147,20 +3127,12 @@ void exit_mmap(struct mm_struct *mm) * Nothing can be holding mm->mmap_lock here and the above call * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. - * - * This needs to be done before calling unlock_range(), - * which clears VM_LOCKED, otherwise the oom reaper cannot - * reliably test it. */ (void)__oom_reap_task_mm(mm); - set_bit(MMF_OOM_SKIP, &mm->flags); } mmap_write_lock(mm); - if (mm->locked_vm) - unlock_range(mm->mmap, ULONG_MAX); - arch_exit_mmap(mm); vma = mm->mmap; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 832fb330376e..6b875acabd1e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -526,7 +526,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) set_bit(MMF_UNSTABLE, &mm->flags); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_lru_vma(vma)) + if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; /* -- cgit v1.2.3 From cea86fe246b694a191804b47378eb9d77aefabec Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:26:39 -0800 Subject: mm/munlock: rmap call mlock_vma_page() munlock_vma_page() Add vma argument to mlock_vma_page() and munlock_vma_page(), make them inline functions which check (vma->vm_flags & VM_LOCKED) before calling mlock_page() and munlock_page() in mm/mlock.c. Add bool compound to mlock_vma_page() and munlock_vma_page(): this is because we have understandable difficulty in accounting pte maps of THPs, and if passed a PageHead page, mlock_page() and munlock_page() cannot tell whether it's a pmd map to be counted or a pte map to be ignored. Add vma arg to page_add_file_rmap() and page_remove_rmap(), like the others, and use that to call mlock_vma_page() at the end of the page adds, and munlock_vma_page() at the end of page_remove_rmap() (end or beginning? unimportant, but end was easier for assertions in testing). No page lock is required (although almost all adds happen to hold it): delete the "Serialize with page migration" BUG_ON(!PageLocked(page))s. Certainly page lock did serialize with page migration, but I'm having difficulty explaining why that was ever important. Mlock accounting on THPs has been hard to define, differed between anon and file, involved PageDoubleMap in some places and not others, required clear_page_mlock() at some points. Keep it simple now: just count the pmds and ignore the ptes, there is no reason for ptes to undo pmd mlocks. page_add_new_anon_rmap() callers unchanged: they have long been calling lru_cache_add_inactive_or_unevictable(), which does its own VM_LOCKED handling (it also checks for not VM_SPECIAL: I think that's overcautious, and inconsistent with other checks, that mmap_region() already prevents VM_LOCKED on VM_SPECIAL; but haven't quite convinced myself to change it). Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/rmap.h | 17 ++++++++------- kernel/events/uprobes.c | 7 ++----- mm/huge_memory.c | 17 +++++++-------- mm/hugetlb.c | 4 ++-- mm/internal.h | 36 ++++++++++++++++++++++++++----- mm/khugepaged.c | 4 ++-- mm/ksm.c | 12 +---------- mm/memory.c | 45 +++++++++++++-------------------------- mm/migrate.c | 9 ++------ mm/mlock.c | 21 +++++++------------ mm/rmap.c | 56 +++++++++++++++++++++++-------------------------- mm/userfaultfd.c | 14 +++++++------ 12 files changed, 113 insertions(+), 129 deletions(-) (limited to 'mm/internal.h') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dc48aa8c2c94..ac29b076082b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -167,18 +167,19 @@ struct anon_vma *page_get_anon_vma(struct page *page); */ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); + unsigned long address, bool compound); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, int); + unsigned long address, int flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); -void page_add_file_rmap(struct page *, bool); -void page_remove_rmap(struct page *, bool); - + unsigned long address, bool compound); +void page_add_file_rmap(struct page *, struct vm_area_struct *, + bool compound); +void page_remove_rmap(struct page *, struct vm_area_struct *, + bool compound); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long); + unsigned long address); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long); + unsigned long address); static inline void page_dup_rmap(struct page *page, bool compound) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6357c3580d07..eed2f7437d96 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,7 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() and munlock_vma_page() below */ + /* For try_to_free_swap() below */ lock_page(old_page); mmu_notifier_invalidate_range_start(&range); @@ -201,13 +201,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - - if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) - munlock_vma_page(old_page); put_page(old_page); err = 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9a34b85ebcf8..d6477f48a27e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1577,7 +1577,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { @@ -1962,7 +1962,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, set_page_dirty(page); if (!PageReferenced(page) && pmd_young(old_pmd)) SetPageReferenced(page); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); @@ -2096,6 +2096,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } } unlock_page_memcg(page); + + /* Above is effectively page_remove_rmap(page, vma, true) */ + munlock_vma_page(page, vma, true); } smp_wmb(); /* make pte visible before pmd */ @@ -2103,7 +2106,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (freeze) { for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, false); + page_remove_rmap(page + i, vma, false); put_page(page + i); } } @@ -2163,8 +2166,6 @@ repeat: do_unlock_page = true; } } - if (PageMlocked(page)) - clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); @@ -3138,7 +3139,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } @@ -3168,10 +3169,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (PageAnon(new)) page_add_anon_rmap(new, vma, mmun_start, true); else - page_add_file_rmap(new, true); + page_add_file_rmap(new, vma, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) - mlock_vma_page(new); update_mmu_cache_pmd(vma, address, pvmw->pmd); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61895cc01d09..43fb3155298e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5014,7 +5014,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); spin_unlock(ptl); tlb_remove_page_size(tlb, page, huge_page_size(h)); @@ -5259,7 +5259,7 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - page_remove_rmap(old_page, true); + page_remove_rmap(old_page, vma, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); diff --git a/mm/internal.h b/mm/internal.h index f235aa92e564..3d7dfc8bc471 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -395,12 +395,35 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, bool write, int *locked); extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); - /* - * must be called with vma's mmap_lock held for read or write, and page locked. + * mlock_vma_page() and munlock_vma_page(): + * should be called with vma's mmap_lock held for read or write, + * under page table lock for the pte/pmd being added or removed. + * + * mlock is usually called at the end of page_add_*_rmap(), + * munlock at the end of page_remove_rmap(); but new anon + * pages are managed in lru_cache_add_inactive_or_unevictable(). + * + * @compound is used to include pmd mappings of THPs, but filter out + * pte mappings of THPs, which cannot be consistently counted: a pte + * mapping of the THP head cannot be distinguished by the page alone. */ -extern void mlock_vma_page(struct page *page); -extern void munlock_vma_page(struct page *page); +void mlock_page(struct page *page); +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + mlock_page(page); +} +void munlock_page(struct page *page); +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + munlock_page(page); +} /* * Clear the page's PageMlocked(). This can be useful in a situation where @@ -487,7 +510,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void clear_page_mlock(struct page *page) { } -static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 131492fd1148..52add1825525 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -774,7 +774,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, false); + page_remove_rmap(src_page, vma, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -1513,7 +1513,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); } pte_unmap_unlock(start_pte, ptl); diff --git a/mm/ksm.c b/mm/ksm.c index c20bd4d9a0d9..c5a4403b5dc9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1177,7 +1177,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); @@ -1252,16 +1252,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, err = replace_page(vma, page, kpage, orig_pte); } - if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { - munlock_vma_page(page); - if (!PageMlocked(kpage)) { - unlock_page(page); - lock_page(kpage); - mlock_vma_page(kpage); - page = kpage; /* for final unlock */ - } - } - out_unlock: unlock_page(page); out: diff --git a/mm/memory.c b/mm/memory.c index c125c4969913..53bd9e5f2e33 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, set_pte_at(vma->vm_mm, address, ptep, pte); - if (vma->vm_flags & VM_LOCKED) - mlock_vma_page(page); - /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. @@ -1377,7 +1374,7 @@ again: mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { @@ -1397,10 +1394,8 @@ again: continue; pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; - if (is_device_private_entry(entry)) - page_remove_rmap(page, false); - + page_remove_rmap(page, vma, false); put_page(page); continue; } @@ -1753,16 +1748,16 @@ static int validate_page_before_insert(struct page *page) return 0; } -static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { if (!pte_none(*pte)) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + page_add_file_rmap(page, vma, false); + set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; } @@ -1776,7 +1771,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { - struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; @@ -1785,17 +1779,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, if (retval) goto out; retval = -ENOMEM; - pte = get_locked_pte(mm, addr, &ptl); + pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; - retval = insert_page_into_pte_locked(mm, pte, addr, page, prot); + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); pte_unmap_unlock(pte, ptl); out: return retval; } #ifdef pte_index -static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1805,7 +1799,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, err = validate_page_before_insert(page); if (err) return err; - return insert_page_into_pte_locked(mm, pte, addr, page, prot); + return insert_page_into_pte_locked(vma, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1842,7 +1836,7 @@ more: start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pte, + int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); @@ -3098,7 +3092,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); } /* Free the old page.. */ @@ -3118,16 +3112,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ mmu_notifier_invalidate_range_only_end(&range); if (old_page) { - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (page_copied && (vma->vm_flags & VM_LOCKED)) { - lock_page(old_page); /* LRU manipulation */ - if (PageMlocked(old_page)) - munlock_vma_page(old_page); - unlock_page(old_page); - } if (page_copied) free_swap_cache(old_page); put_page(old_page); @@ -3947,7 +3931,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); - page_add_file_rmap(page, true); + page_add_file_rmap(page, vma, true); + /* * deposit and withdraw with pmd lock held */ @@ -3996,7 +3981,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, false); + page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } diff --git a/mm/migrate.c b/mm/migrate.c index c7da064b4781..7c4223ce2500 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,14 +248,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else - page_add_file_rmap(new, false); + page_add_file_rmap(new, vma, false); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } - if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) - mlock_vma_page(new); - - if (PageTransHuge(page) && PageMlocked(page)) - clear_page_mlock(page); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); @@ -2331,7 +2326,7 @@ again: * drop page refcount. Page won't be freed, as we took * a reference just above. */ - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); put_page(page); if (pte_present(pte)) diff --git a/mm/mlock.c b/mm/mlock.c index 5d7ced8303be..92f28258b4ae 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -78,17 +78,13 @@ void clear_page_mlock(struct page *page) } } -/* - * Mark page as mlocked if not already. - * If page on LRU, isolate and putback to move to unevictable list. +/** + * mlock_page - mlock a page + * @page: page to be mlocked, either a normal page or a THP head. */ -void mlock_vma_page(struct page *page) +void mlock_page(struct page *page) { - /* Serialize with page migration */ - BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); if (!TestSetPageMlocked(page)) { int nr_pages = thp_nr_pages(page); @@ -101,14 +97,11 @@ void mlock_vma_page(struct page *page) } /** - * munlock_vma_page - munlock a vma page - * @page: page to be unlocked, either a normal page or THP page head + * munlock_page - munlock a page + * @page: page to be munlocked, either a normal page or a THP head. */ -void munlock_vma_page(struct page *page) +void munlock_page(struct page *page) { - /* Serialize with page migration */ - BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); if (TestClearPageMlocked(page)) { diff --git a/mm/rmap.c b/mm/rmap.c index 7ce7f1946cff..6cc8bf129f18 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1181,17 +1181,17 @@ void do_page_add_anon_rmap(struct page *page, __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) { + if (unlikely(PageKsm(page))) unlock_page_memcg(page); - return; - } /* address might be in next vma when migration races vma_adjust */ - if (first) + else if (first) __page_set_anon_rmap(page, vma, address, flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); + + mlock_vma_page(page, vma, compound); } /** @@ -1232,12 +1232,14 @@ void page_add_new_anon_rmap(struct page *page, /** * page_add_file_rmap - add pte mapping to a file page - * @page: the page to add the mapping to - * @compound: charge the page as compound or small page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_add_file_rmap(struct page *page, bool compound) +void page_add_file_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { int i, nr = 1; @@ -1260,13 +1262,8 @@ void page_add_file_rmap(struct page *page, bool compound) nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { - struct page *head = compound_head(page); - VM_WARN_ON_ONCE(!PageLocked(page)); - - SetPageDoubleMap(head); - if (PageMlocked(page)) - clear_page_mlock(head); + SetPageDoubleMap(compound_head(page)); } if (!atomic_inc_and_test(&page->_mapcount)) goto out; @@ -1274,6 +1271,8 @@ void page_add_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); + + mlock_vma_page(page, vma, compound); } static void page_remove_file_rmap(struct page *page, bool compound) @@ -1368,11 +1367,13 @@ static void page_remove_anon_compound_rmap(struct page *page) /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from + * @vma: the vm area from which the mapping is removed * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page, bool compound) +void page_remove_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { lock_page_memcg(page); @@ -1414,6 +1415,8 @@ void page_remove_rmap(struct page *page, bool compound) */ out: unlock_page_memcg(page); + + munlock_vma_page(page, vma, compound); } /* @@ -1469,28 +1472,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_PAGE(!pvmw.pte, page); + /* - * If the page is mlock()d, we cannot swap it out. + * If the page is in an mlock()d vma, we must not swap it out. */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * PTE-mapped THP are never marked as mlocked: so do - * not set it on a DoubleMap THP, nor on an Anon THP - * (which may still be PTE-mapped after DoubleMap was - * cleared). But stop unmapping even in those cases. - */ - if (!PageTransCompound(page) || (PageHead(page) && - !PageDoubleMap(page) && !PageAnon(page))) - mlock_vma_page(page); + /* Restore the mlock which got missed */ + mlock_vma_page(page, vma, false); page_vma_mapped_walk_done(&pvmw); ret = false; break; } - /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_PAGE(!pvmw.pte, page); - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; @@ -1668,7 +1664,7 @@ discard: * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, vma, PageHuge(page)); put_page(page); } @@ -1942,7 +1938,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, vma, PageHuge(page)); put_page(page); } @@ -2078,7 +2074,7 @@ static bool page_make_device_exclusive_one(struct page *page, * There is a reference on the page for the swap entry which has * been removed, so shouldn't take another. */ - page_remove_rmap(subpage, false); + page_remove_rmap(subpage, vma, false); } mmu_notifier_invalidate_range_end(&range); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0780c2a57ff1..15d3e97a6e04 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -95,10 +95,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, if (!pte_none(*dst_pte)) goto out_unlock; - if (page_in_cache) - page_add_file_rmap(page, false); - else + if (page_in_cache) { + /* Usually, cache pages are already added to LRU */ + if (newly_allocated) + lru_cache_add(page); + page_add_file_rmap(page, dst_vma, false); + } else { page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + lru_cache_add_inactive_or_unevictable(page, dst_vma); + } /* * Must happen after rmap, as mm_counter() checks mapping (via @@ -106,9 +111,6 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, */ inc_mm_counter(dst_mm, mm_counter(page)); - if (newly_allocated) - lru_cache_add_inactive_or_unevictable(page, dst_vma); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ -- cgit v1.2.3 From b109b87050df5438ee745b2bddfa3587970025bb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:28:05 -0800 Subject: mm/munlock: replace clear_page_mlock() by final clearance Placing munlock_vma_page() at the end of page_remove_rmap() shifts most of the munlocking to clear_page_mlock(), since PageMlocked is typically still set when mapcount has fallen to 0. That is not what we want: we want /proc/vmstat's unevictable_pgs_cleared to remain as a useful check on the integrity of of the mlock/munlock protocol - small numbers are not surprising, but big numbers mean the protocol is not working. That could be easily fixed by placing munlock_vma_page() at the start of page_remove_rmap(); but later in the series we shall want to batch the munlocking, and that too would tend to leave PageMlocked still set at the point when it is checked. So delete clear_page_mlock() now: leave it instead to release_pages() (and __page_cache_release()) to do this backstop clearing of Mlocked, when page refcount has fallen to 0. If a pinned page occasionally gets counted as Mlocked and Unevictable until it is unpinned, that's okay. A slightly regrettable side-effect of this change is that, since release_pages() and __page_cache_release() may be called at interrupt time, those places which update NR_MLOCK with interrupts enabled had better use mod_zone_page_state() than __mod_zone_page_state() (but holding the lruvec lock always has interrupts disabled). This change, forcing Mlocked off when refcount 0 instead of earlier when mapcount 0, is not fundamental: it can be reversed if performance or something else is found to suffer; but this is the easiest way to separate the stats - let's not complicate that without good reason. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/internal.h | 12 ------------ mm/mlock.c | 30 ------------------------------ mm/rmap.c | 9 --------- mm/swap.c | 32 ++++++++++++++++++++++++-------- 4 files changed, 24 insertions(+), 59 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 3d7dfc8bc471..a43d79335c16 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -425,17 +425,6 @@ static inline void munlock_vma_page(struct page *page, munlock_page(page); } -/* - * Clear the page's PageMlocked(). This can be useful in a situation where - * we want to unconditionally remove a page from the pagecache -- e.g., - * on truncation or freeing. - * - * It is legal to call this function for any page, mlocked or not. - * If called for a page that is still mapped by mlocked vmas, all we do - * is revert to lazy LRU behaviour -- semantics are not broken. - */ -extern void clear_page_mlock(struct page *page); - extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* @@ -509,7 +498,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, diff --git a/mm/mlock.c b/mm/mlock.c index 92f28258b4ae..3c26473050a3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -48,36 +48,6 @@ EXPORT_SYMBOL(can_do_mlock); * PageUnevictable is set to indicate the unevictable state. */ -/* - * LRU accounting for clear_page_mlock() - */ -void clear_page_mlock(struct page *page) -{ - int nr_pages; - - if (!TestClearPageMlocked(page)) - return; - - nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); - /* - * The previous TestClearPageMlocked() corresponds to the smp_mb() - * in __pagevec_lru_add_fn(). - * - * See __pagevec_lru_add_fn for more explanation. - */ - if (!isolate_lru_page(page)) { - putback_lru_page(page); - } else { - /* - * We lost the race. the page already moved to evictable list. - */ - if (PageUnevictable(page)) - count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); - } -} - /** * mlock_page - mlock a page * @page: page to be mlocked, either a normal page or a THP head. diff --git a/mm/rmap.c b/mm/rmap.c index 6cc8bf129f18..5442a5c97a85 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1315,9 +1315,6 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); - - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1357,9 +1354,6 @@ static void page_remove_anon_compound_rmap(struct page *page) nr = thp_nr_pages(page); } - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); - if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); } @@ -1398,9 +1392,6 @@ void page_remove_rmap(struct page *page, */ __dec_lruvec_page_state(page, NR_ANON_MAPPED); - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); - if (PageTransCompound(page)) deferred_split_huge_page(compound_head(page)); diff --git a/mm/swap.c b/mm/swap.c index bcf3ac288b56..ff4810e4a4bc 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,8 +74,8 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { }; /* - * This path almost never happens for VM activity - pages are normally - * freed via pagevecs. But it gets used by networking. + * This path almost never happens for VM activity - pages are normally freed + * via pagevecs. But it gets used by networking - and for compound pages. */ static void __page_cache_release(struct page *page) { @@ -89,6 +89,14 @@ static void __page_cache_release(struct page *page) __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); } + /* See comment on PageMlocked in release_pages() */ + if (unlikely(PageMlocked(page))) { + int nr_pages = thp_nr_pages(page); + + __ClearPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); + } __ClearPageWaiters(page); } @@ -489,12 +497,8 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; if (unlikely(unevictable) && !TestSetPageMlocked(page)) { int nr_pages = thp_nr_pages(page); - /* - * We use the irq-unsafe __mod_zone_page_state because this - * counter is not modified from interrupt context, and the pte - * lock is held(spinlock), which implies preemption disabled. - */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } lru_cache_add(page); @@ -969,6 +973,18 @@ void release_pages(struct page **pages, int nr) __clear_page_lru_flags(page); } + /* + * In rare cases, when truncation or holepunching raced with + * munlock after VM_LOCKED was cleared, Mlocked may still be + * found set here. This does not indicate a problem, unless + * "unevictable_pgs_cleared" appears worryingly large. + */ + if (unlikely(PageMlocked(page))) { + __ClearPageMlocked(page); + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + } + __ClearPageWaiters(page); list_add(&page->lru, &pages_to_free); -- cgit v1.2.3 From 34b6792380ce4f4b41018351cd67c9c26f4a7a0d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:31:48 -0800 Subject: mm/munlock: mlock_pte_range() when mlocking or munlocking Fill in missing pieces: reimplementation of munlock_vma_pages_range(), required to lower the mlock_counts when munlocking without munmapping; and its complement, implementation of mlock_vma_pages_range(), required to raise the mlock_counts on pages already there when a range is mlocked. Combine them into just the one function mlock_vma_pages_range(), using walk_page_range() to run mlock_pte_range(). This approach fixes the "Very slow unlockall()" of unpopulated PROT_NONE areas, reported in https://lore.kernel.org/linux-mm/70885d37-62b7-748b-29df-9e94f3291736@gmail.com/ Munlock clears VM_LOCKED at the start, under exclusive mmap_lock; but if a racing truncate or holepunch (depending on i_mmap_rwsem) gets to the pte first, it will not try to munlock the page: leaving release_pages() to correct it when the last reference to the page is gone - that's okay, a page is not evictable anyway while it is held by an extra reference. Mlock sets VM_LOCKED at the start, under exclusive mmap_lock; but if a racing remove_migration_pte() or try_to_unmap_one() (depending on i_mmap_rwsem) gets to the pte first, it will try to mlock the page, then mlock_pte_range() mlock it a second time. This is harder to reproduce, but a more serious race because it could leave the page unevictable indefinitely though the area is munlocked afterwards. Guard against it by setting the (inappropriate) VM_IO flag, and modifying mlock_vma_page() to decline such vmas. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/internal.h | 3 +- mm/mlock.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 23 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index a43d79335c16..b3f0dd3ffba2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -412,7 +412,8 @@ void mlock_page(struct page *page); static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { - if (unlikely(vma->vm_flags & VM_LOCKED) && + /* VM_IO check prevents migration from double-counting during mlock */ + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_IO)) == VM_LOCKED) && (compound || !PageTransCompound(page))) mlock_page(page); } diff --git a/mm/mlock.c b/mm/mlock.c index f8a3a54687dd..581ea8bf1b83 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -127,25 +128,91 @@ out: unlock_page_memcg(page); } +static int mlock_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct vm_area_struct *vma = walk->vma; + spinlock_t *ptl; + pte_t *start_pte, *pte; + struct page *page; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (!pmd_present(*pmd)) + goto out; + if (is_huge_zero_pmd(*pmd)) + goto out; + page = pmd_page(*pmd); + if (vma->vm_flags & VM_LOCKED) + mlock_page(page); + else + munlock_page(page); + goto out; + } + + start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { + if (!pte_present(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + if (!page) + continue; + if (PageTransCompound(page)) + continue; + if (vma->vm_flags & VM_LOCKED) + mlock_page(page); + else + munlock_page(page); + } + pte_unmap(start_pte); +out: + spin_unlock(ptl); + cond_resched(); + return 0; +} + /* - * munlock_vma_pages_range() - munlock all pages in the vma range.' - * @vma - vma containing range to be munlock()ed. + * mlock_vma_pages_range() - mlock any pages already in the range, + * or munlock all pages in the range. + * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range - * @end - end of range in @vma. - * - * For mremap(), munmap() and exit(). + * @end - end of range in @vma + * @newflags - the new set of flags for @vma. * - * Called with @vma VM_LOCKED. - * - * Returns with VM_LOCKED cleared. Callers must be prepared to - * deal with this. + * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; + * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. */ -static void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static void mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, vm_flags_t newflags) { - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + static const struct mm_walk_ops mlock_walk_ops = { + .pmd_entry = mlock_pte_range, + }; - /* Reimplementation to follow in later commit */ + /* + * There is a slight chance that concurrent page migration, + * or page reclaim finding a page of this now-VM_LOCKED vma, + * will call mlock_vma_page() and raise page's mlock_count: + * double counting, leaving the page unevictable indefinitely. + * Communicate this danger to mlock_vma_page() with VM_IO, + * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. + * mmap_lock is held in write mode here, so this weird + * combination should not be visible to other mmap_lock users; + * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. + */ + if (newflags & VM_LOCKED) + newflags |= VM_IO; + WRITE_ONCE(vma->vm_flags, newflags); + + lru_add_drain(); + walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); + lru_add_drain(); + + if (newflags & VM_IO) { + newflags &= ~VM_IO; + WRITE_ONCE(vma->vm_flags, newflags); + } } /* @@ -164,10 +231,9 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff_t pgoff; int nr_pages; int ret = 0; - int lock = !!(newflags & VM_LOCKED); - vm_flags_t old_flags = vma->vm_flags; + vm_flags_t oldflags = vma->vm_flags; - if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || + if (newflags == oldflags || (oldflags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || vma_is_dax(vma) || vma_is_secretmem(vma)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ @@ -199,9 +265,9 @@ success: * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; - if (!lock) + if (!(newflags & VM_LOCKED)) nr_pages = -nr_pages; - else if (old_flags & VM_LOCKED) + else if (oldflags & VM_LOCKED) nr_pages = 0; mm->locked_vm += nr_pages; @@ -211,11 +277,12 @@ success: * set VM_LOCKED, populate_vma_page_range will bring it back. */ - if (lock) + if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { + /* No work to do, and mlocking twice would be wrong */ vma->vm_flags = newflags; - else - munlock_vma_pages_range(vma, start, end); - + } else { + mlock_vma_pages_range(vma, start, end, newflags); + } out: *prev = vma; return ret; -- cgit v1.2.3 From 2fbb0c10d1e8222604132b3a3f81bfd8345a44b6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:37:29 -0800 Subject: mm/munlock: mlock_page() munlock_page() batch by pagevec A weakness of the page->mlock_count approach is the need for lruvec lock while holding page table lock. That is not an overhead we would allow on normal pages, but I think acceptable just for pages in an mlocked area. But let's try to amortize the extra cost by gathering on per-cpu pagevec before acquiring the lruvec lock. I have an unverified conjecture that the mlock pagevec might work out well for delaying the mlock processing of new file pages until they have got off lru_cache_add()'s pagevec and on to LRU. The initialization of page->mlock_count is subject to races and awkward: 0 or !!PageMlocked or 1? Was it wrong even in the implementation before this commit, which just widens the window? I haven't gone back to think it through. Maybe someone can point out a better way to initialize it. Bringing lru_cache_add_inactive_or_unevictable()'s mlock initialization into mm/mlock.c has helped: mlock_new_page(), using the mlock pagevec, rather than lru_cache_add()'s pagevec. Experimented with various orderings: the right thing seems to be for mlock_page() and mlock_new_page() to TestSetPageMlocked before adding to pagevec, but munlock_page() to leave TestClearPageMlocked to the later pagevec processing. Dropped the VM_BUG_ON_PAGE(PageTail)s this time around: they have made their point, and the thp_nr_page()s already contain a VM_BUG_ON_PGFLAGS() for that. This still leaves acquiring lruvec locks under page table lock each time the pagevec fills (or a THP is added): which I suppose is rather silly, since they sit on pagevec waiting to be processed long after page table lock has been dropped; but I'm disinclined to uglify the calling sequence until some load shows an actual problem with it (nothing wrong with taking lruvec lock under page table lock, just "nicer" to do it less). Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/internal.h | 9 ++- mm/mlock.c | 219 +++++++++++++++++++++++++++++++++++++++++++++++++--------- mm/swap.c | 27 ++++---- 3 files changed, 208 insertions(+), 47 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index b3f0dd3ffba2..18af980bb1b8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -402,7 +402,8 @@ extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, * * mlock is usually called at the end of page_add_*_rmap(), * munlock at the end of page_remove_rmap(); but new anon - * pages are managed in lru_cache_add_inactive_or_unevictable(). + * pages are managed by lru_cache_add_inactive_or_unevictable() + * calling mlock_new_page(). * * @compound is used to include pmd mappings of THPs, but filter out * pte mappings of THPs, which cannot be consistently counted: a pte @@ -425,6 +426,9 @@ static inline void munlock_vma_page(struct page *page, (compound || !PageTransCompound(page))) munlock_page(page); } +void mlock_new_page(struct page *page); +bool need_mlock_page_drain(int cpu); +void mlock_page_drain(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -503,6 +507,9 @@ static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } +static inline void mlock_new_page(struct page *page) { } +static inline bool need_mlock_page_drain(int cpu) { return false; } +static inline void mlock_page_drain(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/mlock.c b/mm/mlock.c index 581ea8bf1b83..d28e56529e5b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -28,6 +28,8 @@ #include "internal.h" +static DEFINE_PER_CPU(struct pagevec, mlock_pvec); + bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) @@ -49,57 +51,79 @@ EXPORT_SYMBOL(can_do_mlock); * PageUnevictable is set to indicate the unevictable state. */ -/** - * mlock_page - mlock a page - * @page: page to be mlocked, either a normal page or a THP head. - */ -void mlock_page(struct page *page) +static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec) { - struct lruvec *lruvec; - int nr_pages = thp_nr_pages(page); + /* There is nothing more we can do while it's off LRU */ + if (!TestClearPageLRU(page)) + return lruvec; - VM_BUG_ON_PAGE(PageTail(page), page); + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); - if (!TestSetPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + if (unlikely(page_evictable(page))) { + /* + * This is a little surprising, but quite possible: + * PageMlocked must have got cleared already by another CPU. + * Could this page be on the Unevictable LRU? I'm not sure, + * but move it now if so. + */ + if (PageUnevictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, + thp_nr_pages(page)); + } + goto out; } - /* There is nothing more we can do while it's off LRU */ - if (!TestClearPageLRU(page)) - return; - - lruvec = folio_lruvec_lock_irq(page_folio(page)); if (PageUnevictable(page)) { - page->mlock_count++; + if (PageMlocked(page)) + page->mlock_count++; goto out; } del_page_from_lru_list(page, lruvec); ClearPageActive(page); SetPageUnevictable(page); - page->mlock_count = 1; + page->mlock_count = !!PageMlocked(page); add_page_to_lru_list(page, lruvec); - __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); out: SetPageLRU(page); - unlock_page_lruvec_irq(lruvec); + return lruvec; } -/** - * munlock_page - munlock a page - * @page: page to be munlocked, either a normal page or a THP head. - */ -void munlock_page(struct page *page) +static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + /* As above, this is a little surprising, but possible */ + if (unlikely(page_evictable(page))) + goto out; + + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + add_page_to_lru_list(page, lruvec); + SetPageLRU(page); + return lruvec; +} + +static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) { - struct lruvec *lruvec; int nr_pages = thp_nr_pages(page); + bool isolated = false; + + if (!TestClearPageLRU(page)) + goto munlock; - VM_BUG_ON_PAGE(PageTail(page), page); + isolated = true; + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); - lock_page_memcg(page); - lruvec = folio_lruvec_lock_irq(page_folio(page)); - if (PageLRU(page) && PageUnevictable(page)) { + if (PageUnevictable(page)) { /* Then mlock_count is maintained, but might undercount */ if (page->mlock_count) page->mlock_count--; @@ -108,24 +132,151 @@ void munlock_page(struct page *page) } /* else assume that was the last mlock: reclaim will fix it if not */ +munlock: if (TestClearPageMlocked(page)) { __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (PageLRU(page) || !PageUnevictable(page)) + if (isolated || !PageUnevictable(page)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } /* page_evictable() has to be checked *after* clearing Mlocked */ - if (PageLRU(page) && PageUnevictable(page) && page_evictable(page)) { + if (isolated && PageUnevictable(page) && page_evictable(page)) { del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); add_page_to_lru_list(page, lruvec); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: - unlock_page_lruvec_irq(lruvec); - unlock_page_memcg(page); + if (isolated) + SetPageLRU(page); + return lruvec; +} + +/* + * Flags held in the low bits of a struct page pointer on the mlock_pvec. + */ +#define LRU_PAGE 0x1 +#define NEW_PAGE 0x2 +static inline struct page *mlock_lru(struct page *page) +{ + return (struct page *)((unsigned long)page + LRU_PAGE); +} + +static inline struct page *mlock_new(struct page *page) +{ + return (struct page *)((unsigned long)page + NEW_PAGE); +} + +/* + * mlock_pagevec() is derived from pagevec_lru_move_fn(): + * perhaps that can make use of such page pointer flags in future, + * but for now just keep it for mlock. We could use three separate + * pagevecs instead, but one feels better (munlocking a full pagevec + * does not need to drain mlocking pagevecs first). + */ +static void mlock_pagevec(struct pagevec *pvec) +{ + struct lruvec *lruvec = NULL; + unsigned long mlock; + struct page *page; + int i; + + for (i = 0; i < pagevec_count(pvec); i++) { + page = pvec->pages[i]; + mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); + page = (struct page *)((unsigned long)page - mlock); + pvec->pages[i] = page; + + if (mlock & LRU_PAGE) + lruvec = __mlock_page(page, lruvec); + else if (mlock & NEW_PAGE) + lruvec = __mlock_new_page(page, lruvec); + else + lruvec = __munlock_page(page, lruvec); + } + + if (lruvec) + unlock_page_lruvec_irq(lruvec); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); +} + +void mlock_page_drain(int cpu) +{ + struct pagevec *pvec; + + pvec = &per_cpu(mlock_pvec, cpu); + if (pagevec_count(pvec)) + mlock_pagevec(pvec); +} + +bool need_mlock_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(mlock_pvec, cpu)); +} + +/** + * mlock_page - mlock a page already on (or temporarily off) LRU + * @page: page to be mlocked, either a normal page or a THP head. + */ +void mlock_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + + if (!TestSetPageMlocked(page)) { + int nr_pages = thp_nr_pages(page); + + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + } + + get_page(page); + if (!pagevec_add(pvec, mlock_lru(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); +} + +/** + * mlock_new_page - mlock a newly allocated page not yet on LRU + * @page: page to be mlocked, either a normal page or a THP head. + */ +void mlock_new_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + int nr_pages = thp_nr_pages(page); + + SetPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + + get_page(page); + if (!pagevec_add(pvec, mlock_new(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); +} + +/** + * munlock_page - munlock a page + * @page: page to be munlocked, either a normal page or a THP head. + */ +void munlock_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + + /* + * TestClearPageMlocked(page) must be left to __munlock_page(), + * which will check whether the page is multiply mlocked. + */ + + get_page(page); + if (!pagevec_add(pvec, page) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, diff --git a/mm/swap.c b/mm/swap.c index 3f770b1ea2c1..842d5cd92cf6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -490,18 +490,12 @@ EXPORT_SYMBOL(folio_add_lru); void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { - bool unevictable; - VM_BUG_ON_PAGE(PageLRU(page), page); - unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; - if (unlikely(unevictable) && !TestSetPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); - - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - } - lru_cache_add(page); + if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) + mlock_new_page(page); + else + lru_cache_add(page); } /* @@ -640,6 +634,7 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); + mlock_page_drain(cpu); } /** @@ -842,6 +837,7 @@ inline void __lru_add_drain_all(bool force_all_cpus) pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu) || + need_mlock_page_drain(cpu) || has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); @@ -1030,7 +1026,7 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests PageMlocked, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think - * not, because munlock_page() only clears PageMlocked while the LRU + * not, because __munlock_page() only clears PageMlocked while the LRU * lock is held. * * (That is not true of __page_cache_release(), and not necessarily @@ -1043,7 +1039,14 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) } else { folio_clear_active(folio); folio_set_unevictable(folio); - folio->mlock_count = !!folio_test_mlocked(folio); + /* + * folio->mlock_count = !!folio_test_mlocked(folio)? + * But that leaves __mlock_page() in doubt whether another + * actor has already counted the mlock or not. Err on the + * safe side, underestimate, let page reclaim fix it, rather + * than leaving a page on the unevictable LRU indefinitely. + */ + folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } -- cgit v1.2.3 From c8263bd605009355edf781f2dd711de633998475 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Mar 2022 17:35:30 -0800 Subject: mm/munlock: mlock_vma_page() check against VM_SPECIAL Although mmap_region() and mlock_fixup() take care that VM_LOCKED is never left set on a VM_SPECIAL vma, there is an interval while file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may still be set while VM_SPECIAL bits are added: so mlock_vma_page() should ignore VM_LOCKED while any VM_SPECIAL bits are set. This showed up as a "Bad page" still mlocked, when vfree()ing pages which had been vm_inserted by remap_vmalloc_range_partial(): while release_pages() and __page_cache_release(), and so put_page(), catch pages still mlocked when freeing (and clear_page_mlock() caught them when unmapping), the vfree() path is unprepared for them: fix it? but these pages should not have been mlocked in the first place. I assume that an mlockall(MCL_FUTURE) had been done in the past; or maybe the user got to specify MAP_LOCKED on a vmalloc'ing driver mmap. Signed-off-by: Hugh Dickins Signed-off-by: Matthew Wilcox (Oracle) --- mm/internal.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 18af980bb1b8..450a2c8a43f3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -413,8 +413,15 @@ void mlock_page(struct page *page); static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { - /* VM_IO check prevents migration from double-counting during mlock */ - if (unlikely((vma->vm_flags & (VM_LOCKED|VM_IO)) == VM_LOCKED) && + /* + * The VM_SPECIAL check here serves two purposes. + * 1) VM_IO check prevents migration from double-counting during mlock. + * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED + * is never left set on a VM_SPECIAL vma, there is an interval while + * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may + * still be set while VM_SPECIAL bits are added: so ignore it then. + */ + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) && (compound || !PageTransCompound(page))) mlock_page(page); } -- cgit v1.2.3 From 27674ef6c73f0c9096a9827dc5d6ba9fc7808422 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:36 +1100 Subject: mm: remove the extra ZONE_DEVICE struct page refcount ZONE_DEVICE struct pages have an extra reference count that complicates the code for put_page() and several places in the kernel that need to check the reference count to see that a page is not being used (gup, compaction, migration, etc.). Clean up the code so the reference count doesn't need to be treated specially for ZONE_DEVICE pages. Note that this excludes the special idle page wakeup for fsdax pages, which still happens at refcount 1. This is a separate issue and will be sorted out later. Given that only fsdax pages require the notifiacation when the refcount hits 1 now, the PAGEMAP_OPS Kconfig symbol can go away and be replaced with a FS_DAX check for this hook in the put_page fastpath. Based on an earlier patch from Ralph Campbell . Link: https://lkml.kernel.org/r/20220210072828.2930359-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 - drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 1 - drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 - fs/Kconfig | 1 - include/linux/memremap.h | 12 +++---- include/linux/mm.h | 6 ++-- lib/test_hmm.c | 1 - mm/Kconfig | 4 --- mm/internal.h | 2 ++ mm/memcontrol.c | 11 ++---- mm/memremap.c | 57 +++++++++++--------------------- mm/migrate.c | 6 ---- mm/swap.c | 16 +++------ 13 files changed, 36 insertions(+), 83 deletions(-) (limited to 'mm/internal.h') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 881951604227..8cabdb39cbbc 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -713,7 +713,6 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - get_page(dpage); lock_page(dpage); return dpage; out_clear: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index cb835f95a76e..e27ca3758762 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -225,7 +225,6 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - get_page(page); lock_page(page); } diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index a5cdfbe32b5e..7ba66ad68a8a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -326,7 +326,6 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - get_page(page); lock_page(page); return page; } diff --git a/fs/Kconfig b/fs/Kconfig index 6c7dc1387beb..e9433bbc4801 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -48,7 +48,6 @@ config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) - select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) select FS_IOMAP select DAX help diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 514ab46f597e..d6a114dd5ea8 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -68,9 +68,9 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 1. (ZONE_DEVICE pages never - * reach 0 refcount unless there is a refcount bug. This allows the - * device driver to implement its own memory management.) + * Called once the page refcount reaches 0. The reference count will be + * reset to one by the core code after the method is called to prepare + * for handing out the page again. */ void (*page_free)(struct page *page); @@ -133,16 +133,14 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) static inline bool is_device_private_page(const struct page *page) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool is_pci_p2pdma_page(const struct page *page) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_PCI_P2PDMA) && + return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } diff --git a/include/linux/mm.h b/include/linux/mm.h index cb8bee88e70c..0201d258c646 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1090,7 +1090,7 @@ static inline bool is_zone_movable_page(const struct page *page) return page_zonenum(page) == ZONE_MOVABLE; } -#ifdef CONFIG_DEV_PAGEMAP_OPS +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); bool __put_devmap_managed_page(struct page *page); @@ -1103,12 +1103,12 @@ static inline bool put_devmap_managed_page(struct page *page) return __put_devmap_managed_page(page); } -#else /* CONFIG_DEV_PAGEMAP_OPS */ +#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ static inline bool put_devmap_managed_page(struct page *page) { return false; } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e5fc14ba71f3..cfe632047839 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -566,7 +566,6 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) } dpage->zone_device_data = rpage; - get_page(dpage); lock_page(dpage); return dpage; diff --git a/mm/Kconfig b/mm/Kconfig index 3326ee3903f3..a1901ae6d062 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -776,9 +776,6 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. -config DEV_PAGEMAP_OPS - bool - # # Helpers to mirror range of the CPU page tables of a process into device page # tables. @@ -790,7 +787,6 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ZONE_DEVICE - select DEV_PAGEMAP_OPS help Allows creation of struct pages to represent unaddressable device diff --git a/mm/internal.h b/mm/internal.h index 450a2c8a43f3..3756dd5d2c92 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -735,4 +735,6 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); +void free_zone_device_page(struct page *page); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2c5032294c9f..8fef072dc1ce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5503,17 +5503,12 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to - * a device and because they are not accessible by CPU they are store - * as special swap entry in the CPU page table. + * Handle device private pages that are not accessible by the CPU, but + * stored as special swap entries in the page table. */ if (is_device_private_entry(ent)) { page = pfn_swap_entry_to_page(ent); - /* - * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have - * a refcount of 1 when free (unlike normal page) - */ - if (!page_ref_add_unless(page, 1, 1)) + if (!get_page_unless_zero(page)) return NULL; return page; } diff --git a/mm/memremap.c b/mm/memremap.c index a0ece2344c2c..fef5734d5e49 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -12,6 +12,7 @@ #include #include #include +#include "internal.h" static DEFINE_XARRAY(pgmap_array); @@ -37,21 +38,19 @@ unsigned long memremap_compat_align(void) EXPORT_SYMBOL_GPL(memremap_compat_align); #endif -#ifdef CONFIG_DEV_PAGEMAP_OPS +#ifdef CONFIG_FS_DAX DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_dec(&devmap_managed_key); } static void devmap_managed_enable_get(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_inc(&devmap_managed_key); } #else @@ -61,7 +60,7 @@ static void devmap_managed_enable_get(struct dev_pagemap *pgmap) static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ static void pgmap_array_delete(struct range *range) { @@ -102,23 +101,12 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) return (range->start + range_len(range)) >> PAGE_SHIFT; } -static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) -{ - if (pfn % (1024 << pgmap->vmemmap_shift)) - cond_resched(); - return pfn + pgmap_vmemmap_nr(pgmap); -} - static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) { return (pfn_end(pgmap, range_id) - pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } -#define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ - pfn = pfn_next(map, pfn)) - static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; @@ -147,13 +135,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) void memunmap_pages(struct dev_pagemap *pgmap) { - unsigned long pfn; int i; percpu_ref_kill(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) - for_each_device_pfn(pfn, pgmap, i) - put_page(pfn_to_page(pfn)); + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); wait_for_completion(&pgmap->done); percpu_ref_exit(&pgmap->ref); @@ -464,14 +450,10 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -#ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page) +void free_zone_device_page(struct page *page) { - /* notify page idle for dax */ - if (!is_device_private_page(page)) { - wake_up_var(&page->_refcount); + if (WARN_ON_ONCE(!is_device_private_page(page))) return; - } __ClearPageWaiters(page); @@ -500,28 +482,27 @@ void free_devmap_managed_page(struct page *page) */ page->mapping = NULL; page->pgmap->ops->page_free(page); + + /* + * Reset the page count to 1 to prepare for handing out the page again. + */ + set_page_count(page, 1); } +#ifdef CONFIG_FS_DAX bool __put_devmap_managed_page(struct page *page) { - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_FS_DAX) + if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) return false; /* - * devmap page refcounts are 1-based, rather than 0-based: if + * fsdax page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. */ - switch (page_ref_dec_return(page)) { - case 1: - free_devmap_managed_page(page); - break; - case 0: - __put_page(page); - break; - } + if (page_ref_dec_return(page) == 1) + wake_up_var(&page->_refcount); return true; } EXPORT_SYMBOL(__put_devmap_managed_page); -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ diff --git a/mm/migrate.c b/mm/migrate.c index e7d0b68d5dcb..af0534de618a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -338,14 +338,8 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) { int expected_count = 1; - /* - * Device private pages have an extra refcount as they are - * ZONE_DEVICE pages. - */ - expected_count += is_device_private_page(page); if (mapping) expected_count += compound_nr(page) + page_has_private(page); - return expected_count; } diff --git a/mm/swap.c b/mm/swap.c index db8d0eea13d7..fc3b7989f5b2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -122,17 +122,9 @@ static void __put_compound_page(struct page *page) void __put_page(struct page *page) { - if (is_zone_device_page(page)) { - put_dev_pagemap(page->pgmap); - - /* - * The page belongs to the device that created pgmap. Do - * not return it to page allocator. - */ - return; - } - - if (unlikely(PageCompound(page))) + if (unlikely(is_zone_device_page(page))) + free_zone_device_page(page); + else if (unlikely(PageCompound(page))) __put_compound_page(page); else __put_single_page(page); @@ -933,7 +925,7 @@ void release_pages(struct page **pages, int nr) if (put_devmap_managed_page(page)) continue; if (put_page_testzero(page)) - put_dev_pagemap(page->pgmap); + free_zone_device_page(page); continue; } -- cgit v1.2.3 From ece1ed7bfa1208b527b3dc90bb45c55e0d139a88 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 4 Feb 2022 10:27:40 -0500 Subject: mm/gup: Add try_get_folio() and try_grab_folio() Convert try_get_compound_head() into try_get_folio() and convert try_grab_compound_head() into try_grab_folio(). Add a temporary try_grab_compound_head() wrapper around try_grab_folio() to let us convert callers individually. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- mm/gup.c | 99 +++++++++++++++++++++++++++++------------------------------ mm/internal.h | 5 +++ 2 files changed, 54 insertions(+), 50 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/gup.c b/mm/gup.c index 56b6b01a430b..11eba8e812aa 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -47,75 +47,70 @@ static void put_page_refs(struct page *page, int refs) } /* - * Return the compound head page with ref appropriately incremented, + * Return the folio with ref appropriately incremented, * or NULL if that failed. */ -static inline struct page *try_get_compound_head(struct page *page, int refs) +static inline struct folio *try_get_folio(struct page *page, int refs) { - struct page *head; + struct folio *folio; retry: - head = compound_head(page); - - if (WARN_ON_ONCE(page_ref_count(head) < 0)) + folio = page_folio(page); + if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) return NULL; - if (unlikely(!page_cache_add_speculative(head, refs))) + if (unlikely(!folio_ref_try_add_rcu(folio, refs))) return NULL; /* - * At this point we have a stable reference to the head page; but it - * could be that between the compound_head() lookup and the refcount - * increment, the compound page was split, in which case we'd end up - * holding a reference on a page that has nothing to do with the page + * At this point we have a stable reference to the folio; but it + * could be that between calling page_folio() and the refcount + * increment, the folio was split, in which case we'd end up + * holding a reference on a folio that has nothing to do with the page * we were given anymore. - * So now that the head page is stable, recheck that the pages still - * belong together. + * So now that the folio is stable, recheck that the page still + * belongs to this folio. */ - if (unlikely(compound_head(page) != head)) { - put_page_refs(head, refs); + if (unlikely(page_folio(page) != folio)) { + folio_put_refs(folio, refs); goto retry; } - return head; + return folio; } /** - * try_grab_compound_head() - attempt to elevate a page's refcount, by a - * flags-dependent amount. - * - * Even though the name includes "compound_head", this function is still - * appropriate for callers that have a non-compound @page to get. - * + * try_grab_folio() - Attempt to get or pin a folio. * @page: pointer to page to be grabbed - * @refs: the value to (effectively) add to the page's refcount + * @refs: the value to (effectively) add to the folio's refcount * @flags: gup flags: these are the FOLL_* flag values. * * "grab" names in this file mean, "look at flags to decide whether to use - * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the * same time. (That's true throughout the get_user_pages*() and * pin_user_pages*() APIs.) Cases: * - * FOLL_GET: page's refcount will be incremented by @refs. + * FOLL_GET: folio's refcount will be incremented by @refs. * - * FOLL_PIN on compound pages: page's refcount will be incremented by - * @refs, and page[1].compound_pincount will be incremented by @refs. + * FOLL_PIN on large folios: folio's refcount will be incremented by + * @refs, and its compound_pincount will be incremented by @refs. * - * FOLL_PIN on normal pages: page's refcount will be incremented by + * FOLL_PIN on single-page folios: folio's refcount will be incremented by * @refs * GUP_PIN_COUNTING_BIAS. * - * Return: head page (with refcount appropriately incremented) for success, or - * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's - * considered failure, and furthermore, a likely bug in the caller, so a warning - * is also emitted. + * Return: The folio containing @page (with refcount appropriately + * incremented) for success, or NULL upon failure. If neither FOLL_GET + * nor FOLL_PIN was set, that's considered failure, and furthermore, + * a likely bug in the caller, so a warning is also emitted. */ -__maybe_unused struct page *try_grab_compound_head(struct page *page, - int refs, unsigned int flags) +struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) { if (flags & FOLL_GET) - return try_get_compound_head(page, refs); + return try_get_folio(page, refs); else if (flags & FOLL_PIN) { + struct folio *folio; + /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow @@ -129,34 +124,38 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, * CAUTION: Don't use compound_head() on the page before this * point, the result won't be stable. */ - page = try_get_compound_head(page, refs); - if (!page) + folio = try_get_folio(page, refs); + if (!folio) return NULL; /* - * When pinning a compound page, use an exact count to - * track it. + * When pinning a large folio, use an exact count to track it. * - * However, be sure to *also* increment the normal page - * refcount field at least once, so that the page really + * However, be sure to *also* increment the normal folio + * refcount field at least once, so that the folio really * is pinned. That's why the refcount from the earlier - * try_get_compound_head() is left intact. + * try_get_folio() is left intact. */ - if (PageHead(page)) - atomic_add(refs, compound_pincount_ptr(page)); + if (folio_test_large(folio)) + atomic_add(refs, folio_pincount_ptr(folio)); else - page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1)); + folio_ref_add(folio, + refs * (GUP_PIN_COUNTING_BIAS - 1)); + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, - refs); - - return page; + return folio; } WARN_ON_ONCE(1); return NULL; } +struct page *try_grab_compound_head(struct page *page, + int refs, unsigned int flags) +{ + return &try_grab_folio(page, refs, flags)->page; +} + static void put_compound_head(struct page *page, int refs, unsigned int flags) { VM_BUG_ON_PAGE(PageTail(page), page); @@ -185,7 +184,7 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) * @flags: gup flags: these are the FOLL_* flag values. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same - * time. Cases: please see the try_grab_compound_head() documentation, with + * time. Cases: please see the try_grab_folio() documentation, with * "refs=1". * * Return: true for success, or if no action was required (if neither FOLL_PIN diff --git a/mm/internal.h b/mm/internal.h index 3756dd5d2c92..98b97cb5a97b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -737,4 +737,9 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, void free_zone_device_page(struct page *page); +/* + * mm/gup.c + */ +struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); + #endif /* __MM_INTERNAL_H */ -- cgit v1.2.3 From d1d8a3b4d06d8c9188f2b9b89ef053db0bf899de Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 Dec 2021 13:26:22 -0500 Subject: mm: Turn isolate_lru_page() into folio_isolate_lru() Add isolate_lru_page() as a wrapper around isolate_lru_folio(). TestClearPageLRU() would have always failed on a tail page, so returning -EBUSY is the same behaviour. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- arch/powerpc/include/asm/mmu_context.h | 1 - mm/folio-compat.c | 8 +++++++ mm/internal.h | 3 ++- mm/vmscan.c | 43 +++++++++++++++------------------- 4 files changed, 29 insertions(+), 26 deletions(-) (limited to 'mm/internal.h') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index fd277b15635c..b8527a74bd4d 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -21,7 +21,6 @@ extern void destroy_context(struct mm_struct *mm); #ifdef CONFIG_SPAPR_TCE_IOMMU struct mm_iommu_table_group_mem_t; -extern int isolate_lru_page(struct page *page); /* from internal.h */ extern bool mm_iommu_preregistered(struct mm_struct *mm); extern long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 749555a232a8..a4a7725f4486 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -7,6 +7,7 @@ #include #include #include +#include "internal.h" struct address_space *page_mapping(struct page *page) { @@ -151,3 +152,10 @@ int try_to_release_page(struct page *page, gfp_t gfp) return filemap_release_folio(page_folio(page), gfp); } EXPORT_SYMBOL(try_to_release_page); + +int isolate_lru_page(struct page *page) +{ + if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) + return -EBUSY; + return folio_isolate_lru((struct folio *)page); +} diff --git a/mm/internal.h b/mm/internal.h index 98b97cb5a97b..69e88fb7546e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -152,7 +152,8 @@ extern unsigned long highest_memmap_pfn; /* * in mm/vmscan.c: */ -extern int isolate_lru_page(struct page *page); +int isolate_lru_page(struct page *page); +int folio_isolate_lru(struct folio *folio); extern void putback_lru_page(struct page *page); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); diff --git a/mm/vmscan.c b/mm/vmscan.c index 74d3e5e8ebe9..55fb6d8e30fd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2211,45 +2211,40 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } /** - * isolate_lru_page - tries to isolate a page from its LRU list - * @page: page to isolate from its LRU list + * folio_isolate_lru() - Try to isolate a folio from its LRU list. + * @folio: Folio to isolate from its LRU list. * - * Isolates a @page from an LRU list, clears PageLRU and adjusts the - * vmstat statistic corresponding to whatever LRU list the page was on. + * Isolate a @folio from an LRU list and adjust the vmstat statistic + * corresponding to whatever LRU list the folio was on. * - * Returns 0 if the page was removed from an LRU list. - * Returns -EBUSY if the page was not on an LRU list. - * - * The returned page will have PageLRU() cleared. If it was found on - * the active list, it will have PageActive set. If it was found on - * the unevictable list, it will have the PageUnevictable bit set. That flag + * The folio will have its LRU flag cleared. If it was found on the + * active list, it will have the Active flag set. If it was found on the + * unevictable list, it will have the Unevictable flag set. These flags * may need to be cleared by the caller before letting the page go. * - * The vmstat statistic corresponding to the list on which the page was - * found will be decremented. - * - * Restrictions: + * Context: * * (1) Must be called with an elevated refcount on the page. This is a - * fundamental difference from isolate_lru_pages (which is called + * fundamental difference from isolate_lru_pages() (which is called * without a stable reference). - * (2) the lru_lock must not be held. - * (3) interrupts must be enabled. + * (2) The lru_lock must not be held. + * (3) Interrupts must be enabled. + * + * Return: 0 if the folio was removed from an LRU list. + * -EBUSY if the folio was not on an LRU list. */ -int isolate_lru_page(struct page *page) +int folio_isolate_lru(struct folio *folio) { - struct folio *folio = page_folio(page); int ret = -EBUSY; - VM_BUG_ON_PAGE(!page_count(page), page); - WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); + VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); - if (TestClearPageLRU(page)) { + if (folio_test_clear_lru(folio)) { struct lruvec *lruvec; - get_page(page); + folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); - del_page_from_lru_list(page, lruvec); + lruvec_del_folio(lruvec, folio); unlock_page_lruvec_irq(lruvec); ret = 0; } -- cgit v1.2.3 From ca6d60f3f18b78d37b7a93262108ade0727d1441 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 21 Jan 2022 08:41:46 -0500 Subject: mm: Turn putback_lru_page() into folio_putback_lru() Add a putback_lru_page() wrapper. Removes a couple of compound_head() calls. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- mm/folio-compat.c | 5 +++++ mm/internal.h | 3 ++- mm/vmscan.c | 16 ++++++++-------- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/folio-compat.c b/mm/folio-compat.c index a4a7725f4486..46fa179e32fb 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -159,3 +159,8 @@ int isolate_lru_page(struct page *page) return -EBUSY; return folio_isolate_lru((struct folio *)page); } + +void putback_lru_page(struct page *page) +{ + folio_putback_lru(page_folio(page)); +} diff --git a/mm/internal.h b/mm/internal.h index 69e88fb7546e..ade30a1e6682 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -154,7 +154,8 @@ extern unsigned long highest_memmap_pfn; */ int isolate_lru_page(struct page *page); int folio_isolate_lru(struct folio *folio); -extern void putback_lru_page(struct page *page); +void putback_lru_page(struct page *page); +void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 4f54c6d22083..5c108d557adf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1357,18 +1357,18 @@ int remove_mapping(struct address_space *mapping, struct page *page) } /** - * putback_lru_page - put previously isolated page onto appropriate LRU list - * @page: page to be put back to appropriate lru list + * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. + * @folio: Folio to be returned to an LRU list. * - * Add previously isolated @page to appropriate LRU list. - * Page may still be unevictable for other reasons. + * Add previously isolated @folio to appropriate LRU list. + * The folio may still be unevictable for other reasons. * - * lru_lock must not be held, interrupts must be enabled. + * Context: lru_lock must not be held, interrupts must be enabled. */ -void putback_lru_page(struct page *page) +void folio_putback_lru(struct folio *folio) { - lru_cache_add(page); - put_page(page); /* drop ref from isolate */ + folio_add_lru(folio); + folio_put(folio); /* drop ref from isolate */ } enum page_references { -- cgit v1.2.3 From d6c75dc22c755c567838f12f12a16f2a323ebd4e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 13 Feb 2022 15:22:28 -0500 Subject: mm/truncate: Split invalidate_inode_page() into mapping_evict_folio() Some of the callers already have the address_space and can avoid calling folio_mapping() and checking if the folio was already truncated. Also add kernel-doc and fix the return type (in case we ever support folios larger than 4TB). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin --- include/linux/mm.h | 1 - mm/internal.h | 1 + mm/memory-failure.c | 4 ++-- mm/truncate.c | 34 +++++++++++++++++++++++----------- 4 files changed, 26 insertions(+), 14 deletions(-) (limited to 'mm/internal.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index a583b7375445..dede2eda4d7f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1825,7 +1825,6 @@ extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int generic_error_remove_page(struct address_space *mapping, struct page *page); -int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, diff --git a/mm/internal.h b/mm/internal.h index ade30a1e6682..9c1959fff477 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -95,6 +95,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); +long invalidate_inode_page(struct page *page); /** * folio_evictable - Test whether a folio is evictable. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 97a9ed8f87a9..0b72a936b8dd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2139,7 +2139,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) */ static int __soft_offline_page(struct page *page) { - int ret = 0; + long ret = 0; unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_head(page); char const *msg_page[] = {"page", "hugepage"}; @@ -2196,7 +2196,7 @@ static int __soft_offline_page(struct page *page) if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n", + pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n", pfn, msg_page[huge], ret, &page->flags); if (ret > 0) ret = -EBUSY; diff --git a/mm/truncate.c b/mm/truncate.c index 1d97c4cae6a0..2fb10735aab4 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -273,18 +273,9 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page) } EXPORT_SYMBOL(generic_error_remove_page); -/* - * Safely invalidate one page from its pagecache mapping. - * It only drops clean, unused pages. The page must be locked. - * - * Returns 1 if the page is successfully invalidated, otherwise 0. - */ -int invalidate_inode_page(struct page *page) +static long mapping_evict_folio(struct address_space *mapping, + struct folio *folio) { - struct folio *folio = page_folio(page); - struct address_space *mapping = folio_mapping(folio); - if (!mapping) - return 0; if (folio_test_dirty(folio) || folio_test_writeback(folio)) return 0; /* The refcount will be elevated if any page in the folio is mapped */ @@ -297,6 +288,27 @@ int invalidate_inode_page(struct page *page) return remove_mapping(mapping, folio); } +/** + * invalidate_inode_page() - Remove an unused page from the pagecache. + * @page: The page to remove. + * + * Safely invalidate one page from its pagecache mapping. + * It only drops clean, unused pages. + * + * Context: Page must be locked. + * Return: The number of pages successfully removed. + */ +long invalidate_inode_page(struct page *page) +{ + struct folio *folio = page_folio(page); + struct address_space *mapping = folio_mapping(folio); + + /* The page may have been truncated before it was locked */ + if (!mapping) + return 0; + return mapping_evict_folio(mapping, folio); +} + /** * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate -- cgit v1.2.3 From 261b6840ed10419ac2f554e515592d59dd5c82cf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 13 Feb 2022 16:40:24 -0500 Subject: mm: Turn deactivate_file_page() into deactivate_file_folio() This function has one caller which already has a reference to the page, so we don't need to use get_page_unless_zero(). Also move the prototype to mm/internal.h. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin --- include/linux/swap.h | 1 - mm/internal.h | 1 + mm/swap.c | 35 ++++++++++++++++++----------------- mm/truncate.c | 2 +- 4 files changed, 20 insertions(+), 19 deletions(-) (limited to 'mm/internal.h') diff --git a/include/linux/swap.h b/include/linux/swap.h index 304f174b4d31..064e60e9f63f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -372,7 +372,6 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); -extern void deactivate_file_page(struct page *page); extern void deactivate_page(struct page *page); extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); diff --git a/mm/internal.h b/mm/internal.h index 9c1959fff477..7c441f43ba31 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -66,6 +66,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); +void deactivate_file_folio(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/swap.c b/mm/swap.c index fc3b7989f5b2..65ec5cbab78b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -630,32 +630,33 @@ void lru_add_drain_cpu(int cpu) } /** - * deactivate_file_page - forcefully deactivate a file page - * @page: page to deactivate + * deactivate_file_folio() - Forcefully deactivate a file folio. + * @folio: Folio to deactivate. * - * This function hints the VM that @page is a good reclaim candidate, - * for example if its invalidation fails due to the page being dirty + * This function hints to the VM that @folio is a good reclaim candidate, + * for example if its invalidation fails due to the folio being dirty * or under writeback. + * + * Context: Caller holds a reference on the page. */ -void deactivate_file_page(struct page *page) +void deactivate_file_folio(struct folio *folio) { + struct pagevec *pvec; + /* - * In a workload with many unevictable page such as mprotect, - * unevictable page deactivation for accelerating reclaim is pointless. + * In a workload with many unevictable pages such as mprotect, + * unevictable folio deactivation for accelerating reclaim is pointless. */ - if (PageUnevictable(page)) + if (folio_test_unevictable(folio)) return; - if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); + folio_get(folio); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); - if (pagevec_add_and_need_flush(pvec, page)) - pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); - local_unlock(&lru_pvecs.lock); - } + if (pagevec_add_and_need_flush(pvec, &folio->page)) + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); + local_unlock(&lru_pvecs.lock); } /* diff --git a/mm/truncate.c b/mm/truncate.c index a8b0243eadf6..9ed62cb3c503 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -527,7 +527,7 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, * of interest and try to speed up its reclaim. */ if (!ret) { - deactivate_file_page(&folio->page); + deactivate_file_folio(folio); /* It is likely on the pagevec of a remote CPU */ if (nr_pagevec) (*nr_pagevec)++; -- cgit v1.2.3 From c56109dd35c9204cd6c49d2116ef36e5044ef867 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 13 Feb 2022 17:22:10 -0500 Subject: mm/truncate: Combine invalidate_mapping_pagevec() and __invalidate_mapping_pages() We can save a function call by combining these two functions, which are identical except for the return value. Also move the prototype to mm/internal.h. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin --- include/linux/fs.h | 4 ---- mm/internal.h | 2 ++ mm/truncate.c | 32 +++++++++++++------------------- 3 files changed, 15 insertions(+), 23 deletions(-) (limited to 'mm/internal.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..85c584c5c623 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2749,10 +2749,6 @@ extern bool is_bad_inode(struct inode *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); -void invalidate_mapping_pagevec(struct address_space *mapping, - pgoff_t start, pgoff_t end, - unsigned long *nr_pagevec); - static inline void invalidate_remote_inode(struct inode *inode) { if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || diff --git a/mm/internal.h b/mm/internal.h index 7c441f43ba31..6047268076e7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -97,6 +97,8 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long invalidate_inode_page(struct page *page); +unsigned long invalidate_mapping_pagevec(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); /** * folio_evictable - Test whether a folio is evictable. diff --git a/mm/truncate.c b/mm/truncate.c index 9ed62cb3c503..cace6e3e4e8c 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -494,7 +494,18 @@ void truncate_inode_pages_final(struct address_space *mapping) } EXPORT_SYMBOL(truncate_inode_pages_final); -static unsigned long __invalidate_mapping_pages(struct address_space *mapping, +/** + * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * @nr_pagevec: invalidate failed page number for caller + * + * This helper is similar to invalidate_mapping_pages(), except that it accounts + * for pages that are likely on a pagevec and counts them in @nr_pagevec, which + * will be used by the caller. + */ +unsigned long invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; @@ -559,27 +570,10 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { - return __invalidate_mapping_pages(mapping, start, end, NULL); + return invalidate_mapping_pagevec(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages); -/** - * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * @nr_pagevec: invalidate failed page number for caller - * - * This helper is similar to invalidate_mapping_pages(), except that it accounts - * for pages that are likely on a pagevec and counts them in @nr_pagevec, which - * will be used by the caller. - */ -void invalidate_mapping_pagevec(struct address_space *mapping, - pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) -{ - __invalidate_mapping_pages(mapping, start, end, nr_pagevec); -} - /* * This is like invalidate_inode_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger -- cgit v1.2.3 From 2aff7a4755bed2870ee23b75bc88cdc8d76cdd03 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 3 Feb 2022 11:40:17 -0500 Subject: mm: Convert page_vma_mapped_walk to work on PFNs page_mapped_in_vma() really just wants to walk one page, but as the code stands, if passed the head page of a compound page, it will walk every page in the compound page. Extract pfn/nr_pages/pgoff from the struct page early, so they can be overridden by page_mapped_in_vma(). Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/hugetlb.h | 5 +++++ include/linux/rmap.h | 17 ++++++++++----- mm/internal.h | 15 ++++++++----- mm/migrate.c | 5 +++-- mm/page_vma_mapped.c | 58 ++++++++++++++++++++++--------------------------- mm/rmap.c | 8 +++---- 6 files changed, 58 insertions(+), 50 deletions(-) (limited to 'mm/internal.h') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d1897a69c540..6ba2f8e74fbb 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -970,6 +970,11 @@ static inline struct hstate *page_hstate(struct page *page) return NULL; } +static inline struct hstate *size_to_hstate(unsigned long size) +{ + return NULL; +} + static inline unsigned long huge_page_size(struct hstate *h) { return PAGE_SIZE; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0d894a2bfaa1..0c838ba1a8ee 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -201,11 +202,13 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) -/* Look for migarion entries rather than present PTEs */ +/* Look for migration entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) struct page_vma_mapped_walk { - struct page *page; + unsigned long pfn; + unsigned long nr_pages; + pgoff_t pgoff; struct vm_area_struct *vma; unsigned long address; pmd_t *pmd; @@ -216,7 +219,9 @@ struct page_vma_mapped_walk { #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ - .page = _page, \ + .pfn = page_to_pfn(_page), \ + .nr_pages = compound_nr(page), \ + .pgoff = page_to_pgoff(page), \ .vma = _vma, \ .address = _address, \ .flags = _flags, \ @@ -224,7 +229,9 @@ struct page_vma_mapped_walk { #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ - .page = &_folio->page, \ + .pfn = folio_pfn(_folio), \ + .nr_pages = folio_nr_pages(_folio), \ + .pgoff = folio_pgoff(_folio), \ .vma = _vma, \ .address = _address, \ .flags = _flags, \ @@ -233,7 +240,7 @@ struct page_vma_mapped_walk { static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) { /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */ - if (pvmw->pte && !PageHuge(pvmw->page)) + if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma)) pte_unmap(pvmw->pte); if (pvmw->ptl) spin_unlock(pvmw->ptl); diff --git a/mm/internal.h b/mm/internal.h index 6047268076e7..3b652444f070 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -10,6 +10,7 @@ #include #include #include +#include #include struct folio_batch; @@ -475,18 +476,20 @@ vma_address(struct page *page, struct vm_area_struct *vma) } /* - * Then at what user virtual address will none of the page be found in vma? + * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. - * If page is a compound head, the entire compound page is considered. */ -static inline unsigned long -vma_address_end(struct page *page, struct vm_area_struct *vma) +static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) { + struct vm_area_struct *vma = pvmw->vma; pgoff_t pgoff; unsigned long address; - VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ - pgoff = page_to_pgoff(page) + compound_nr(page); + /* Common case, plus ->pgoff is invalid for KSM */ + if (pvmw->nr_pages == 1) + return pvmw->address + PAGE_SIZE; + + pgoff = pvmw->pgoff + pvmw->nr_pages; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address > vma->vm_end) diff --git a/mm/migrate.c b/mm/migrate.c index 71f92e8ed934..358bc311caaa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -174,7 +174,8 @@ void putback_movable_pages(struct list_head *l) static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { - DEFINE_PAGE_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); + DEFINE_PAGE_VMA_WALK(pvmw, (struct page *)old, vma, addr, + PVMW_SYNC | PVMW_MIGRATION); struct page *new; pte_t pte; swp_entry_t entry; @@ -184,7 +185,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (PageKsm(page)) new = page; else - new = page - pvmw.page->index + + new = page - pvmw.pgoff + linear_page_index(vma, pvmw.address); #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index f7b331081791..1187f9c1ec5b 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -53,18 +53,6 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) return true; } -static inline bool pfn_is_match(struct page *page, unsigned long pfn) -{ - unsigned long page_pfn = page_to_pfn(page); - - /* normal page and hugetlbfs page */ - if (!PageTransCompound(page) || PageHuge(page)) - return page_pfn == pfn; - - /* THP can be referenced by any subpage */ - return pfn >= page_pfn && pfn - page_pfn < thp_nr_pages(page); -} - /** * check_pte - check if @pvmw->page is mapped at the @pvmw->pte * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking @@ -116,7 +104,17 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) pfn = pte_pfn(*pvmw->pte); } - return pfn_is_match(pvmw->page, pfn); + return (pfn - pvmw->pfn) < pvmw->nr_pages; +} + +/* Returns true if the two ranges overlap. Careful to not overflow. */ +static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw) +{ + if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn) + return false; + if (pfn > pvmw->pfn + pvmw->nr_pages - 1) + return false; + return true; } static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) @@ -127,7 +125,7 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) } /** - * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at + * page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at * @pvmw->address * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags * must be set. pmd, pte and ptl must be NULL. @@ -152,8 +150,8 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) */ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) { - struct mm_struct *mm = pvmw->vma->vm_mm; - struct page *page = pvmw->page; + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; unsigned long end; pgd_t *pgd; p4d_t *p4d; @@ -164,32 +162,26 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pmd && !pvmw->pte) return not_found(pvmw); - if (unlikely(PageHuge(page))) { + if (unlikely(is_vm_hugetlb_page(vma))) { + unsigned long size = pvmw->nr_pages * PAGE_SIZE; /* The only possible mapping was handled on last iteration */ if (pvmw->pte) return not_found(pvmw); /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); + pvmw->pte = huge_pte_offset(mm, pvmw->address, size); if (!pvmw->pte) return false; - pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte); + pvmw->ptl = huge_pte_lockptr(size_to_hstate(size), mm, + pvmw->pte); spin_lock(pvmw->ptl); if (!check_pte(pvmw)) return not_found(pvmw); return true; } - /* - * Seek to next pte only makes sense for THP. - * But more important than that optimization, is to filter out - * any PageKsm page: whose page->index misleads vma_address() - * and vma_address_end() to disaster. - */ - end = PageTransCompound(page) ? - vma_address_end(page, pvmw->vma) : - pvmw->address + PAGE_SIZE; + end = vma_address_end(pvmw); if (pvmw->pte) goto next_pte; restart: @@ -224,7 +216,7 @@ restart: if (likely(pmd_trans_huge(pmde))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); - if (pmd_page(pmde) != page) + if (!check_pmd(pmd_pfn(pmde), pvmw)) return not_found(pvmw); return true; } @@ -236,7 +228,7 @@ restart: return not_found(pvmw); entry = pmd_to_swp_entry(pmde); if (!is_migration_entry(entry) || - pfn_swap_entry_to_page(entry) != page) + !check_pmd(swp_offset(entry), pvmw)) return not_found(pvmw); return true; } @@ -250,7 +242,8 @@ restart: * cleared *pmd but not decremented compound_mapcount(). */ if ((pvmw->flags & PVMW_SYNC) && - PageTransCompound(page)) { + transparent_hugepage_active(vma) && + (pvmw->nr_pages >= HPAGE_PMD_NR)) { spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); spin_unlock(ptl); @@ -307,7 +300,8 @@ next_pte: int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { struct page_vma_mapped_walk pvmw = { - .page = page, + .pfn = page_to_pfn(page), + .nr_pages = 1, .vma = vma, .flags = PVMW_SYNC, }; diff --git a/mm/rmap.c b/mm/rmap.c index a7f06b76b503..e27ba4172069 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -940,7 +940,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma, vma->vm_mm, address, - vma_address_end(page, vma)); + vma_address_end(&pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { @@ -1437,8 +1437,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - range.end = PageKsm(page) ? - address + PAGE_SIZE : vma_address_end(page, vma); + range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, range.end); if (PageHuge(page)) { @@ -1732,8 +1731,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - range.end = PageKsm(page) ? - address + PAGE_SIZE : vma_address_end(page, vma); + range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, range.end); if (PageHuge(page)) { -- cgit v1.2.3 From dcc5d337c5e62761ee71f2e25c7aa890b1aa41a2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 15 Feb 2022 13:33:59 -0500 Subject: mm/mlock: Add mlock_vma_folio() Convert mlock_page() into mlock_folio() and convert the callers. Keep mlock_vma_page() as a wrapper. Signed-off-by: Matthew Wilcox (Oracle) --- mm/internal.h | 15 +++++++++++---- mm/mlock.c | 22 +++++++++++----------- 2 files changed, 22 insertions(+), 15 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 3b652444f070..6039acc780c0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -416,8 +416,8 @@ extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, * pte mappings of THPs, which cannot be consistently counted: a pte * mapping of the THP head cannot be distinguished by the page alone. */ -void mlock_page(struct page *page); -static inline void mlock_vma_page(struct page *page, +void mlock_folio(struct folio *folio); +static inline void mlock_vma_folio(struct folio *folio, struct vm_area_struct *vma, bool compound) { /* @@ -429,9 +429,16 @@ static inline void mlock_vma_page(struct page *page, * still be set while VM_SPECIAL bits are added: so ignore it then. */ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) && - (compound || !PageTransCompound(page))) - mlock_page(page); + (compound || !folio_test_large(folio))) + mlock_folio(folio); +} + +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + mlock_vma_folio(page_folio(page), vma, compound); } + void munlock_page(struct page *page); static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) diff --git a/mm/mlock.c b/mm/mlock.c index d28e56529e5b..833d482746d9 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -218,23 +218,23 @@ bool need_mlock_page_drain(int cpu) } /** - * mlock_page - mlock a page already on (or temporarily off) LRU - * @page: page to be mlocked, either a normal page or a THP head. + * mlock_folio - mlock a folio already on (or temporarily off) LRU + * @folio: folio to be mlocked. */ -void mlock_page(struct page *page) +void mlock_folio(struct folio *folio) { struct pagevec *pvec = &get_cpu_var(mlock_pvec); - if (!TestSetPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); + if (!folio_test_set_mlocked(folio)) { + int nr_pages = folio_nr_pages(folio); - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } - get_page(page); - if (!pagevec_add(pvec, mlock_lru(page)) || - PageHead(page) || lru_cache_disabled()) + folio_get(folio); + if (!pagevec_add(pvec, mlock_lru(&folio->page)) || + folio_test_large(folio) || lru_cache_disabled()) mlock_pagevec(pvec); put_cpu_var(mlock_pvec); } @@ -296,7 +296,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, goto out; page = pmd_page(*pmd); if (vma->vm_flags & VM_LOCKED) - mlock_page(page); + mlock_folio(page_folio(page)); else munlock_page(page); goto out; @@ -312,7 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, if (PageTransCompound(page)) continue; if (vma->vm_flags & VM_LOCKED) - mlock_page(page); + mlock_folio(page_folio(page)); else munlock_page(page); } -- cgit v1.2.3 From e05b34539d008ab819388f699b25eae962ba24ac Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 29 Jan 2022 11:52:52 -0500 Subject: mm: Turn page_anon_vma() into folio_anon_vma() Move the prototype from mm.h to mm/internal.h and convert all callers to pass a folio. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 1 - mm/internal.h | 1 + mm/ksm.c | 3 ++- mm/rmap.c | 19 ++++++++++++------- mm/util.c | 3 +-- 5 files changed, 16 insertions(+), 11 deletions(-) (limited to 'mm/internal.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d380dc26847..a879c583f665 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1730,7 +1730,6 @@ static inline void *folio_address(const struct folio *folio) } extern void *page_rmapping(struct page *page); -extern struct anon_vma *page_anon_vma(struct page *page); extern pgoff_t __page_file_index(struct page *page); /* diff --git a/mm/internal.h b/mm/internal.h index 6039acc780c0..2b2c2c4eb63a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -392,6 +392,7 @@ static inline bool is_data_mapping(vm_flags_t flags) void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev); void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); +struct anon_vma *folio_anon_vma(struct folio *folio); #ifdef CONFIG_MMU void unmap_mapping_folio(struct folio *folio); diff --git a/mm/ksm.c b/mm/ksm.c index ea82fef93a31..b25d545e0cd1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2554,7 +2554,8 @@ void __ksm_exit(struct mm_struct *mm) struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address) { - struct anon_vma *anon_vma = page_anon_vma(page); + struct folio *folio = page_folio(page); + struct anon_vma *anon_vma = folio_anon_vma(folio); struct page *new_page; if (PageKsm(page)) { diff --git a/mm/rmap.c b/mm/rmap.c index 64655d345234..09301aecf2fc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -737,8 +737,9 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { - if (PageAnon(page)) { - struct anon_vma *page__anon_vma = page_anon_vma(page); + struct folio *folio = page_folio(page); + if (folio_test_anon(folio)) { + struct anon_vma *page__anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. @@ -748,7 +749,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; - } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) { + } else if (vma->vm_file->f_mapping != folio->mapping) { return -EFAULT; } @@ -1103,6 +1104,7 @@ static void __page_set_anon_rmap(struct page *page, static void __page_check_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + struct folio *folio = page_folio(page); /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. @@ -1114,7 +1116,8 @@ static void __page_check_anon_rmap(struct page *page, * are initially only visible via the pagetables, and the pte is locked * over the call to page_add_new_anon_rmap. */ - VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page); + VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, + folio); VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), page); } @@ -2177,6 +2180,7 @@ void __put_anon_vma(struct anon_vma *anon_vma) static struct anon_vma *rmap_walk_anon_lock(struct page *page, struct rmap_walk_control *rwc) { + struct folio *folio = page_folio(page); struct anon_vma *anon_vma; if (rwc->anon_lock) @@ -2188,7 +2192,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ - anon_vma = page_anon_vma(page); + anon_vma = folio_anon_vma(folio); if (!anon_vma) return NULL; @@ -2208,14 +2212,15 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { + struct folio *folio = page_folio(page); struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; if (locked) { - anon_vma = page_anon_vma(page); + anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? */ - VM_BUG_ON_PAGE(!anon_vma, page); + VM_BUG_ON_FOLIO(!anon_vma, folio); } else { anon_vma = rmap_walk_anon_lock(page, rwc); } diff --git a/mm/util.c b/mm/util.c index b614f423aaa4..13fc88ac8e70 100644 --- a/mm/util.c +++ b/mm/util.c @@ -679,9 +679,8 @@ bool folio_mapped(struct folio *folio) } EXPORT_SYMBOL(folio_mapped); -struct anon_vma *page_anon_vma(struct page *page) +struct anon_vma *folio_anon_vma(struct folio *folio) { - struct folio *folio = page_folio(page); unsigned long mapping = (unsigned long)folio->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) -- cgit v1.2.3 From 56a4d67c264e37014b8392cba9869c7fe904ed1e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 24 Jul 2021 23:26:14 -0400 Subject: mm/readahead: Switch to page_cache_ra_order do_page_cache_ra() was being exposed for the benefit of do_sync_mmap_readahead(). Switch it over to page_cache_ra_order() partly because it's a better interface but mostly for the benefit of the next patch. Signed-off-by: Matthew Wilcox (Oracle) --- mm/filemap.c | 2 +- mm/internal.h | 4 ++-- mm/readahead.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/filemap.c b/mm/filemap.c index 8f7ac3de9098..fe764225ae99 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3027,7 +3027,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ractl._index = ra->start; - do_page_cache_ra(&ractl, ra->size, ra->async_size); + page_cache_ra_order(&ractl, ra, 0); return fpin; } diff --git a/mm/internal.h b/mm/internal.h index 2b2c2c4eb63a..293eca1360dc 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -79,8 +79,8 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); -void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, - unsigned long lookahead_size); +void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, + unsigned int order); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) diff --git a/mm/readahead.c b/mm/readahead.c index 5100eaf5b0ee..a20391d6a71b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -247,7 +247,7 @@ EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ -void do_page_cache_ra(struct readahead_control *ractl, +static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = ractl->mapping->host; @@ -462,7 +462,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, return err; } -static void page_cache_ra_order(struct readahead_control *ractl, +void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { struct address_space *mapping = ractl->mapping; -- cgit v1.2.3