From b109b87050df5438ee745b2bddfa3587970025bb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:28:05 -0800 Subject: mm/munlock: replace clear_page_mlock() by final clearance Placing munlock_vma_page() at the end of page_remove_rmap() shifts most of the munlocking to clear_page_mlock(), since PageMlocked is typically still set when mapcount has fallen to 0. That is not what we want: we want /proc/vmstat's unevictable_pgs_cleared to remain as a useful check on the integrity of of the mlock/munlock protocol - small numbers are not surprising, but big numbers mean the protocol is not working. That could be easily fixed by placing munlock_vma_page() at the start of page_remove_rmap(); but later in the series we shall want to batch the munlocking, and that too would tend to leave PageMlocked still set at the point when it is checked. So delete clear_page_mlock() now: leave it instead to release_pages() (and __page_cache_release()) to do this backstop clearing of Mlocked, when page refcount has fallen to 0. If a pinned page occasionally gets counted as Mlocked and Unevictable until it is unpinned, that's okay. A slightly regrettable side-effect of this change is that, since release_pages() and __page_cache_release() may be called at interrupt time, those places which update NR_MLOCK with interrupts enabled had better use mod_zone_page_state() than __mod_zone_page_state() (but holding the lruvec lock always has interrupts disabled). This change, forcing Mlocked off when refcount 0 instead of earlier when mapcount 0, is not fundamental: it can be reversed if performance or something else is found to suffer; but this is the easiest way to separate the stats - let's not complicate that without good reason. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/swap.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'mm/swap.c') diff --git a/mm/swap.c b/mm/swap.c index bcf3ac288b56..ff4810e4a4bc 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,8 +74,8 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { }; /* - * This path almost never happens for VM activity - pages are normally - * freed via pagevecs. But it gets used by networking. + * This path almost never happens for VM activity - pages are normally freed + * via pagevecs. But it gets used by networking - and for compound pages. */ static void __page_cache_release(struct page *page) { @@ -89,6 +89,14 @@ static void __page_cache_release(struct page *page) __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); } + /* See comment on PageMlocked in release_pages() */ + if (unlikely(PageMlocked(page))) { + int nr_pages = thp_nr_pages(page); + + __ClearPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); + } __ClearPageWaiters(page); } @@ -489,12 +497,8 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; if (unlikely(unevictable) && !TestSetPageMlocked(page)) { int nr_pages = thp_nr_pages(page); - /* - * We use the irq-unsafe __mod_zone_page_state because this - * counter is not modified from interrupt context, and the pte - * lock is held(spinlock), which implies preemption disabled. - */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } lru_cache_add(page); @@ -969,6 +973,18 @@ void release_pages(struct page **pages, int nr) __clear_page_lru_flags(page); } + /* + * In rare cases, when truncation or holepunching raced with + * munlock after VM_LOCKED was cleared, Mlocked may still be + * found set here. This does not indicate a problem, unless + * "unevictable_pgs_cleared" appears worryingly large. + */ + if (unlikely(PageMlocked(page))) { + __ClearPageMlocked(page); + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + } + __ClearPageWaiters(page); list_add(&page->lru, &pages_to_free); -- cgit v1.2.3 From 07ca760673088f262da57ff42c15558688565aa2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:29:54 -0800 Subject: mm/munlock: maintain page->mlock_count while unevictable Previous patches have been preparatory: now implement page->mlock_count. The ordering of the "Unevictable LRU" is of no significance, and there is no point holding unevictable pages on a list: place page->mlock_count to overlay page->lru.prev (since page->lru.next is overlaid by compound_head, which needs to be even so as not to satisfy PageTail - though 2 could be added instead of 1 for each mlock, if that's ever an improvement). But it's only safe to rely on or modify page->mlock_count while lruvec lock is held and page is on unevictable "LRU" - we can save lots of edits by continuing to pretend that there's an imaginary LRU here (there is an unevictable count which still needs to be maintained, but not a list). The mlock_count technique suffers from an unreliability much like with page_mlock(): while someone else has the page off LRU, not much can be done. As before, err on the safe side (behave as if mlock_count 0), and let try_to_unlock_one() move the page to unevictable if reclaim finds out later on - a few misplaced pages don't matter, what we want to avoid is imbalancing reclaim by flooding evictable lists with unevictable pages. I am not a fan of "if (!isolate_lru_page(page)) putback_lru_page(page);": if we have taken lruvec lock to get the page off its present list, then we save everyone trouble (and however many extra atomic ops) by putting it on its destination list immediately. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm_inline.h | 11 +++++--- include/linux/mm_types.h | 19 +++++++++++-- mm/huge_memory.c | 5 +++- mm/memcontrol.c | 3 +-- mm/mlock.c | 68 +++++++++++++++++++++++++++++++++++++---------- mm/mmzone.c | 7 +++++ mm/swap.c | 1 + 7 files changed, 92 insertions(+), 22 deletions(-) (limited to 'mm/swap.c') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b725839dfe71..884d6f6af05b 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -99,7 +99,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); - list_add(&folio->lru, &lruvec->lists[lru]); + if (lru != LRU_UNEVICTABLE) + list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list(struct page *page, @@ -115,6 +116,7 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); + /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } @@ -127,8 +129,11 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { - list_del(&folio->lru); - update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), + enum lru_list lru = folio_lru_list(folio); + + if (lru != LRU_UNEVICTABLE) + list_del(&folio->lru); + update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5140e5feb486..475bdb282769 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -85,7 +85,16 @@ struct page { * lruvec->lru_lock. Sometimes used as a generic list * by the page owner. */ - struct list_head lru; + union { + struct list_head lru; + /* Or, for the Unevictable "LRU list" slot */ + struct { + /* Always even, to negate PageTail */ + void *__filler; + /* Count page's or folio's mlocks */ + unsigned int mlock_count; + }; + }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; pgoff_t index; /* Our offset within mapping. */ @@ -241,7 +250,13 @@ struct folio { struct { /* public: */ unsigned long flags; - struct list_head lru; + union { + struct list_head lru; + struct { + void *__filler; + unsigned int mlock_count; + }; + }; struct address_space *mapping; pgoff_t index; void *private; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d6477f48a27e..9afca0122723 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2300,8 +2300,11 @@ static void lru_add_page_tail(struct page *head, struct page *tail, } else { /* head is still on lru (and we have it frozen) */ VM_WARN_ON(!PageLRU(head)); + if (PageUnevictable(tail)) + tail->mlock_count = 0; + else + list_add_tail(&tail->lru, &head->lru); SetPageLRU(tail); - list_add_tail(&tail->lru, &head->lru); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36e9f38c919d..c78b9d3b9c04 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1257,8 +1257,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added - * to or just after a page is removed from an lru list (that ordering being - * so as to allow it to check that lru_size 0 is consistent with list_empty). + * to or just after a page is removed from an lru list. */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages) diff --git a/mm/mlock.c b/mm/mlock.c index 3c26473050a3..f8a3a54687dd 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -54,16 +54,35 @@ EXPORT_SYMBOL(can_do_mlock); */ void mlock_page(struct page *page) { + struct lruvec *lruvec; + int nr_pages = thp_nr_pages(page); + VM_BUG_ON_PAGE(PageTail(page), page); if (!TestSetPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - if (!isolate_lru_page(page)) - putback_lru_page(page); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + } + + /* There is nothing more we can do while it's off LRU */ + if (!TestClearPageLRU(page)) + return; + + lruvec = folio_lruvec_lock_irq(page_folio(page)); + if (PageUnevictable(page)) { + page->mlock_count++; + goto out; } + + del_page_from_lru_list(page, lruvec); + ClearPageActive(page); + SetPageUnevictable(page); + page->mlock_count = 1; + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); +out: + SetPageLRU(page); + unlock_page_lruvec_irq(lruvec); } /** @@ -72,19 +91,40 @@ void mlock_page(struct page *page) */ void munlock_page(struct page *page) { + struct lruvec *lruvec; + int nr_pages = thp_nr_pages(page); + VM_BUG_ON_PAGE(PageTail(page), page); + lock_page_memcg(page); + lruvec = folio_lruvec_lock_irq(page_folio(page)); + if (PageLRU(page) && PageUnevictable(page)) { + /* Then mlock_count is maintained, but might undercount */ + if (page->mlock_count) + page->mlock_count--; + if (page->mlock_count) + goto out; + } + /* else assume that was the last mlock: reclaim will fix it if not */ + if (TestClearPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); - - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (!isolate_lru_page(page)) { - putback_lru_page(page); - count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); - } else if (PageUnevictable(page)) { - count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); - } + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (PageLRU(page) || !PageUnevictable(page)) + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); + else + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); + } + + /* page_evictable() has to be checked *after* clearing Mlocked */ + if (PageLRU(page) && PageUnevictable(page) && page_evictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } +out: + unlock_page_lruvec_irq(lruvec); + unlock_page_memcg(page); } /* diff --git a/mm/mmzone.c b/mm/mmzone.c index eb89d6e018e2..40e1d9428300 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -81,6 +81,13 @@ void lruvec_init(struct lruvec *lruvec) for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); + /* + * The "Unevictable LRU" is imaginary: though its size is maintained, + * it is never scanned, and unevictable pages are not threaded on it + * (so that their lru fields can be reused to hold mlock_count). + * Poison its list head, so that any operations on it would crash. + */ + list_del(&lruvec->lists[LRU_UNEVICTABLE]); } #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) diff --git a/mm/swap.c b/mm/swap.c index ff4810e4a4bc..682a03301a2c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1062,6 +1062,7 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) } else { folio_clear_active(folio); folio_set_unevictable(folio); + folio->mlock_count = !!folio_test_mlocked(folio); if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } -- cgit v1.2.3 From 2262ace60713348228b2abe0629e77091643d29e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:34:46 -0800 Subject: mm/munlock: delete smp_mb() from __pagevec_lru_add_fn() My reading of comment on smp_mb__after_atomic() in __pagevec_lru_add_fn() says that it can now be deleted; and that remains so when the next patch is added. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/swap.c | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) (limited to 'mm/swap.c') diff --git a/mm/swap.c b/mm/swap.c index 682a03301a2c..3f770b1ea2c1 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1025,37 +1025,18 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + folio_set_lru(folio); /* - * A folio becomes evictable in two ways: - * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. - * 2) Before acquiring LRU lock to put the folio on the correct LRU - * and then - * a) do PageLRU check with lock [check_move_unevictable_pages] - * b) do PageLRU check before lock [clear_page_mlock] - * - * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need - * following strict ordering: - * - * #0: __pagevec_lru_add_fn #1: clear_page_mlock - * - * folio_set_lru() folio_test_clear_mlocked() - * smp_mb() // explicit ordering // above provides strict - * // ordering - * folio_test_mlocked() folio_test_lru() + * Is an smp_mb__after_atomic() still required here, before + * folio_evictable() tests PageMlocked, to rule out the possibility + * of stranding an evictable folio on an unevictable LRU? I think + * not, because munlock_page() only clears PageMlocked while the LRU + * lock is held. * - * - * if '#1' does not observe setting of PG_lru by '#0' and - * fails isolation, the explicit barrier will make sure that - * folio_evictable check will put the folio on the correct - * LRU. Without smp_mb(), folio_set_lru() can be reordered - * after folio_test_mlocked() check and can make '#1' fail the - * isolation of the folio whose mlocked bit is cleared (#0 is - * also looking at the same folio) and the evictable folio will - * be stranded on an unevictable LRU. + * (That is not true of __page_cache_release(), and not necessarily + * true of release_pages(): but those only clear PageMlocked after + * put_page_testzero() has excluded any other users of the page.) */ - folio_set_lru(folio); - smp_mb__after_atomic(); - if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); -- cgit v1.2.3 From 2fbb0c10d1e8222604132b3a3f81bfd8345a44b6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:37:29 -0800 Subject: mm/munlock: mlock_page() munlock_page() batch by pagevec A weakness of the page->mlock_count approach is the need for lruvec lock while holding page table lock. That is not an overhead we would allow on normal pages, but I think acceptable just for pages in an mlocked area. But let's try to amortize the extra cost by gathering on per-cpu pagevec before acquiring the lruvec lock. I have an unverified conjecture that the mlock pagevec might work out well for delaying the mlock processing of new file pages until they have got off lru_cache_add()'s pagevec and on to LRU. The initialization of page->mlock_count is subject to races and awkward: 0 or !!PageMlocked or 1? Was it wrong even in the implementation before this commit, which just widens the window? I haven't gone back to think it through. Maybe someone can point out a better way to initialize it. Bringing lru_cache_add_inactive_or_unevictable()'s mlock initialization into mm/mlock.c has helped: mlock_new_page(), using the mlock pagevec, rather than lru_cache_add()'s pagevec. Experimented with various orderings: the right thing seems to be for mlock_page() and mlock_new_page() to TestSetPageMlocked before adding to pagevec, but munlock_page() to leave TestClearPageMlocked to the later pagevec processing. Dropped the VM_BUG_ON_PAGE(PageTail)s this time around: they have made their point, and the thp_nr_page()s already contain a VM_BUG_ON_PGFLAGS() for that. This still leaves acquiring lruvec locks under page table lock each time the pagevec fills (or a THP is added): which I suppose is rather silly, since they sit on pagevec waiting to be processed long after page table lock has been dropped; but I'm disinclined to uglify the calling sequence until some load shows an actual problem with it (nothing wrong with taking lruvec lock under page table lock, just "nicer" to do it less). Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/internal.h | 9 ++- mm/mlock.c | 219 +++++++++++++++++++++++++++++++++++++++++++++++++--------- mm/swap.c | 27 ++++---- 3 files changed, 208 insertions(+), 47 deletions(-) (limited to 'mm/swap.c') diff --git a/mm/internal.h b/mm/internal.h index b3f0dd3ffba2..18af980bb1b8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -402,7 +402,8 @@ extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, * * mlock is usually called at the end of page_add_*_rmap(), * munlock at the end of page_remove_rmap(); but new anon - * pages are managed in lru_cache_add_inactive_or_unevictable(). + * pages are managed by lru_cache_add_inactive_or_unevictable() + * calling mlock_new_page(). * * @compound is used to include pmd mappings of THPs, but filter out * pte mappings of THPs, which cannot be consistently counted: a pte @@ -425,6 +426,9 @@ static inline void munlock_vma_page(struct page *page, (compound || !PageTransCompound(page))) munlock_page(page); } +void mlock_new_page(struct page *page); +bool need_mlock_page_drain(int cpu); +void mlock_page_drain(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -503,6 +507,9 @@ static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } +static inline void mlock_new_page(struct page *page) { } +static inline bool need_mlock_page_drain(int cpu) { return false; } +static inline void mlock_page_drain(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/mlock.c b/mm/mlock.c index 581ea8bf1b83..d28e56529e5b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -28,6 +28,8 @@ #include "internal.h" +static DEFINE_PER_CPU(struct pagevec, mlock_pvec); + bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) @@ -49,57 +51,79 @@ EXPORT_SYMBOL(can_do_mlock); * PageUnevictable is set to indicate the unevictable state. */ -/** - * mlock_page - mlock a page - * @page: page to be mlocked, either a normal page or a THP head. - */ -void mlock_page(struct page *page) +static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec) { - struct lruvec *lruvec; - int nr_pages = thp_nr_pages(page); + /* There is nothing more we can do while it's off LRU */ + if (!TestClearPageLRU(page)) + return lruvec; - VM_BUG_ON_PAGE(PageTail(page), page); + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); - if (!TestSetPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + if (unlikely(page_evictable(page))) { + /* + * This is a little surprising, but quite possible: + * PageMlocked must have got cleared already by another CPU. + * Could this page be on the Unevictable LRU? I'm not sure, + * but move it now if so. + */ + if (PageUnevictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, + thp_nr_pages(page)); + } + goto out; } - /* There is nothing more we can do while it's off LRU */ - if (!TestClearPageLRU(page)) - return; - - lruvec = folio_lruvec_lock_irq(page_folio(page)); if (PageUnevictable(page)) { - page->mlock_count++; + if (PageMlocked(page)) + page->mlock_count++; goto out; } del_page_from_lru_list(page, lruvec); ClearPageActive(page); SetPageUnevictable(page); - page->mlock_count = 1; + page->mlock_count = !!PageMlocked(page); add_page_to_lru_list(page, lruvec); - __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); out: SetPageLRU(page); - unlock_page_lruvec_irq(lruvec); + return lruvec; } -/** - * munlock_page - munlock a page - * @page: page to be munlocked, either a normal page or a THP head. - */ -void munlock_page(struct page *page) +static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + /* As above, this is a little surprising, but possible */ + if (unlikely(page_evictable(page))) + goto out; + + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + add_page_to_lru_list(page, lruvec); + SetPageLRU(page); + return lruvec; +} + +static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) { - struct lruvec *lruvec; int nr_pages = thp_nr_pages(page); + bool isolated = false; + + if (!TestClearPageLRU(page)) + goto munlock; - VM_BUG_ON_PAGE(PageTail(page), page); + isolated = true; + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); - lock_page_memcg(page); - lruvec = folio_lruvec_lock_irq(page_folio(page)); - if (PageLRU(page) && PageUnevictable(page)) { + if (PageUnevictable(page)) { /* Then mlock_count is maintained, but might undercount */ if (page->mlock_count) page->mlock_count--; @@ -108,24 +132,151 @@ void munlock_page(struct page *page) } /* else assume that was the last mlock: reclaim will fix it if not */ +munlock: if (TestClearPageMlocked(page)) { __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (PageLRU(page) || !PageUnevictable(page)) + if (isolated || !PageUnevictable(page)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } /* page_evictable() has to be checked *after* clearing Mlocked */ - if (PageLRU(page) && PageUnevictable(page) && page_evictable(page)) { + if (isolated && PageUnevictable(page) && page_evictable(page)) { del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); add_page_to_lru_list(page, lruvec); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: - unlock_page_lruvec_irq(lruvec); - unlock_page_memcg(page); + if (isolated) + SetPageLRU(page); + return lruvec; +} + +/* + * Flags held in the low bits of a struct page pointer on the mlock_pvec. + */ +#define LRU_PAGE 0x1 +#define NEW_PAGE 0x2 +static inline struct page *mlock_lru(struct page *page) +{ + return (struct page *)((unsigned long)page + LRU_PAGE); +} + +static inline struct page *mlock_new(struct page *page) +{ + return (struct page *)((unsigned long)page + NEW_PAGE); +} + +/* + * mlock_pagevec() is derived from pagevec_lru_move_fn(): + * perhaps that can make use of such page pointer flags in future, + * but for now just keep it for mlock. We could use three separate + * pagevecs instead, but one feels better (munlocking a full pagevec + * does not need to drain mlocking pagevecs first). + */ +static void mlock_pagevec(struct pagevec *pvec) +{ + struct lruvec *lruvec = NULL; + unsigned long mlock; + struct page *page; + int i; + + for (i = 0; i < pagevec_count(pvec); i++) { + page = pvec->pages[i]; + mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); + page = (struct page *)((unsigned long)page - mlock); + pvec->pages[i] = page; + + if (mlock & LRU_PAGE) + lruvec = __mlock_page(page, lruvec); + else if (mlock & NEW_PAGE) + lruvec = __mlock_new_page(page, lruvec); + else + lruvec = __munlock_page(page, lruvec); + } + + if (lruvec) + unlock_page_lruvec_irq(lruvec); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); +} + +void mlock_page_drain(int cpu) +{ + struct pagevec *pvec; + + pvec = &per_cpu(mlock_pvec, cpu); + if (pagevec_count(pvec)) + mlock_pagevec(pvec); +} + +bool need_mlock_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(mlock_pvec, cpu)); +} + +/** + * mlock_page - mlock a page already on (or temporarily off) LRU + * @page: page to be mlocked, either a normal page or a THP head. + */ +void mlock_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + + if (!TestSetPageMlocked(page)) { + int nr_pages = thp_nr_pages(page); + + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + } + + get_page(page); + if (!pagevec_add(pvec, mlock_lru(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); +} + +/** + * mlock_new_page - mlock a newly allocated page not yet on LRU + * @page: page to be mlocked, either a normal page or a THP head. + */ +void mlock_new_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + int nr_pages = thp_nr_pages(page); + + SetPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + + get_page(page); + if (!pagevec_add(pvec, mlock_new(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); +} + +/** + * munlock_page - munlock a page + * @page: page to be munlocked, either a normal page or a THP head. + */ +void munlock_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + + /* + * TestClearPageMlocked(page) must be left to __munlock_page(), + * which will check whether the page is multiply mlocked. + */ + + get_page(page); + if (!pagevec_add(pvec, page) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, diff --git a/mm/swap.c b/mm/swap.c index 3f770b1ea2c1..842d5cd92cf6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -490,18 +490,12 @@ EXPORT_SYMBOL(folio_add_lru); void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { - bool unevictable; - VM_BUG_ON_PAGE(PageLRU(page), page); - unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; - if (unlikely(unevictable) && !TestSetPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); - - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - } - lru_cache_add(page); + if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) + mlock_new_page(page); + else + lru_cache_add(page); } /* @@ -640,6 +634,7 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); + mlock_page_drain(cpu); } /** @@ -842,6 +837,7 @@ inline void __lru_add_drain_all(bool force_all_cpus) pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu) || + need_mlock_page_drain(cpu) || has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); @@ -1030,7 +1026,7 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests PageMlocked, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think - * not, because munlock_page() only clears PageMlocked while the LRU + * not, because __munlock_page() only clears PageMlocked while the LRU * lock is held. * * (That is not true of __page_cache_release(), and not necessarily @@ -1043,7 +1039,14 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) } else { folio_clear_active(folio); folio_set_unevictable(folio); - folio->mlock_count = !!folio_test_mlocked(folio); + /* + * folio->mlock_count = !!folio_test_mlocked(folio)? + * But that leaves __mlock_page() in doubt whether another + * actor has already counted the mlock or not. Err on the + * safe side, underestimate, let page reclaim fix it, rather + * than leaving a page on the unevictable LRU indefinitely. + */ + folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } -- cgit v1.2.3 From 75e55d8a107edb2fd6e02b1fa8a81531209cda04 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:35 +1100 Subject: mm: move free_devmap_managed_page to memremap.c free_devmap_managed_page has nothing to do with the code in swap.c, move it to live with the rest of the code for devmap handling. Link: https://lkml.kernel.org/r/20220210072828.2930359-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Chaitanya Kulkarni Reviewed-by: Muchun Song Reviewed-by: Dan Williams Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Christian Knig Cc: Felix Kuehling Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 1 - mm/memremap.c | 21 +++++++++++++++++++++ mm/swap.c | 23 ----------------------- 3 files changed, 21 insertions(+), 24 deletions(-) (limited to 'mm/swap.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2cca8cd30186..a9d6473fc045 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1092,7 +1092,6 @@ static inline bool is_zone_movable_page(const struct page *page) } #ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); static inline bool page_is_devmap_managed(struct page *page) diff --git a/mm/memremap.c b/mm/memremap.c index 5f04a0709e43..55d23e9f5c04 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -501,4 +501,25 @@ void free_devmap_managed_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); } + +void put_devmap_managed_page(struct page *page) +{ + int count; + + if (WARN_ON_ONCE(!page_is_devmap_managed(page))) + return; + + count = page_ref_dec_return(page); + + /* + * devmap page refcounts are 1-based, rather than 0-based: if + * refcount is 1, then the page is free and the refcount is + * stable because nobody holds a reference on the page. + */ + if (count == 1) + free_devmap_managed_page(page); + else if (!count) + __put_page(page); +} +EXPORT_SYMBOL(put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/mm/swap.c b/mm/swap.c index 842d5cd92cf6..e499df864ef7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1154,26 +1154,3 @@ void __init swap_setup(void) * _really_ don't want to cluster much more */ } - -#ifdef CONFIG_DEV_PAGEMAP_OPS -void put_devmap_managed_page(struct page *page) -{ - int count; - - if (WARN_ON_ONCE(!page_is_devmap_managed(page))) - return; - - count = page_ref_dec_return(page); - - /* - * devmap page refcounts are 1-based, rather than 0-based: if - * refcount is 1, then the page is free and the refcount is - * stable because nobody holds a reference on the page. - */ - if (count == 1) - free_devmap_managed_page(page); - else if (!count) - __put_page(page); -} -EXPORT_SYMBOL(put_devmap_managed_page); -#endif -- cgit v1.2.3 From 895749455f6054e0c7b40a6ec449a3ab6db51bdd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:35 +1100 Subject: mm: simplify freeing of devmap managed pages Make put_devmap_managed_page return if it took charge of the page or not and remove the separate page_is_devmap_managed helper. Link: https://lkml.kernel.org/r/20220210072828.2930359-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Chaitanya Kulkarni Reviewed-by: Dan Williams Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Christian Knig Cc: Felix Kuehling Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 34 ++++++++++------------------------ mm/memremap.c | 20 +++++++++----------- mm/swap.c | 10 +--------- 3 files changed, 20 insertions(+), 44 deletions(-) (limited to 'mm/swap.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index a9d6473fc045..8a59f0456149 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1094,33 +1094,24 @@ static inline bool is_zone_movable_page(const struct page *page) #ifdef CONFIG_DEV_PAGEMAP_OPS DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -static inline bool page_is_devmap_managed(struct page *page) +bool __put_devmap_managed_page(struct page *page); +static inline bool put_devmap_managed_page(struct page *page) { if (!static_branch_unlikely(&devmap_managed_key)) return false; if (!is_zone_device_page(page)) return false; - switch (page->pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_FS_DAX: - return true; - default: - break; - } - return false; + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && + page->pgmap->type != MEMORY_DEVICE_FS_DAX) + return false; + return __put_devmap_managed_page(page); } -void put_devmap_managed_page(struct page *page); - #else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool page_is_devmap_managed(struct page *page) +static inline bool put_devmap_managed_page(struct page *page) { return false; } - -static inline void put_devmap_managed_page(struct page *page) -{ -} #endif /* CONFIG_DEV_PAGEMAP_OPS */ static inline bool is_device_private_page(const struct page *page) @@ -1220,16 +1211,11 @@ static inline void put_page(struct page *page) struct folio *folio = page_folio(page); /* - * For devmap managed pages we need to catch refcount transition from - * 2 to 1, when refcount reach one it means the page is free and we - * need to inform the device driver through callback. See - * include/linux/memremap.h and HMM for details. + * For some devmap managed pages we need to catch refcount transition + * from 2 to 1: */ - if (page_is_devmap_managed(&folio->page)) { - put_devmap_managed_page(&folio->page); + if (put_devmap_managed_page(&folio->page)) return; - } - folio_put(folio); } diff --git a/mm/memremap.c b/mm/memremap.c index 55d23e9f5c04..f41233a67edb 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -502,24 +502,22 @@ void free_devmap_managed_page(struct page *page) page->pgmap->ops->page_free(page); } -void put_devmap_managed_page(struct page *page) +bool __put_devmap_managed_page(struct page *page) { - int count; - - if (WARN_ON_ONCE(!page_is_devmap_managed(page))) - return; - - count = page_ref_dec_return(page); - /* * devmap page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. */ - if (count == 1) + switch (page_ref_dec_return(page)) { + case 1: free_devmap_managed_page(page); - else if (!count) + break; + case 0: __put_page(page); + break; + } + return true; } -EXPORT_SYMBOL(put_devmap_managed_page); +EXPORT_SYMBOL(__put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/mm/swap.c b/mm/swap.c index e499df864ef7..db8d0eea13d7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -930,16 +930,8 @@ void release_pages(struct page **pages, int nr) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - /* - * ZONE_DEVICE pages that return 'false' from - * page_is_devmap_managed() do not require special - * processing, and instead, expect a call to - * put_page_testzero(). - */ - if (page_is_devmap_managed(page)) { - put_devmap_managed_page(page); + if (put_devmap_managed_page(page)) continue; - } if (put_page_testzero(page)) put_dev_pagemap(page->pgmap); continue; -- cgit v1.2.3 From 27674ef6c73f0c9096a9827dc5d6ba9fc7808422 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Feb 2022 15:31:36 +1100 Subject: mm: remove the extra ZONE_DEVICE struct page refcount ZONE_DEVICE struct pages have an extra reference count that complicates the code for put_page() and several places in the kernel that need to check the reference count to see that a page is not being used (gup, compaction, migration, etc.). Clean up the code so the reference count doesn't need to be treated specially for ZONE_DEVICE pages. Note that this excludes the special idle page wakeup for fsdax pages, which still happens at refcount 1. This is a separate issue and will be sorted out later. Given that only fsdax pages require the notifiacation when the refcount hits 1 now, the PAGEMAP_OPS Kconfig symbol can go away and be replaced with a FS_DAX check for this hook in the put_page fastpath. Based on an earlier patch from Ralph Campbell . Link: https://lkml.kernel.org/r/20220210072828.2930359-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Acked-by: Felix Kuehling Tested-by: "Sierra Guiza, Alejandro (Alex)" Cc: Alex Deucher Cc: Alistair Popple Cc: Ben Skeggs Cc: Chaitanya Kulkarni Cc: Christian Knig Cc: Karol Herbst Cc: Lyude Paul Cc: Miaohe Lin Cc: Muchun Song Cc: "Pan, Xinhui" Signed-off-by: Andrew Morton Signed-off-by: Matthew Wilcox (Oracle) --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 - drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 1 - drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 - fs/Kconfig | 1 - include/linux/memremap.h | 12 +++---- include/linux/mm.h | 6 ++-- lib/test_hmm.c | 1 - mm/Kconfig | 4 --- mm/internal.h | 2 ++ mm/memcontrol.c | 11 ++---- mm/memremap.c | 57 +++++++++++--------------------- mm/migrate.c | 6 ---- mm/swap.c | 16 +++------ 13 files changed, 36 insertions(+), 83 deletions(-) (limited to 'mm/swap.c') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 881951604227..8cabdb39cbbc 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -713,7 +713,6 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - get_page(dpage); lock_page(dpage); return dpage; out_clear: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index cb835f95a76e..e27ca3758762 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -225,7 +225,6 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - get_page(page); lock_page(page); } diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index a5cdfbe32b5e..7ba66ad68a8a 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -326,7 +326,6 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - get_page(page); lock_page(page); return page; } diff --git a/fs/Kconfig b/fs/Kconfig index 6c7dc1387beb..e9433bbc4801 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -48,7 +48,6 @@ config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) - select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) select FS_IOMAP select DAX help diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 514ab46f597e..d6a114dd5ea8 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -68,9 +68,9 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 1. (ZONE_DEVICE pages never - * reach 0 refcount unless there is a refcount bug. This allows the - * device driver to implement its own memory management.) + * Called once the page refcount reaches 0. The reference count will be + * reset to one by the core code after the method is called to prepare + * for handing out the page again. */ void (*page_free)(struct page *page); @@ -133,16 +133,14 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) static inline bool is_device_private_page(const struct page *page) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool is_pci_p2pdma_page(const struct page *page) { - return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && - IS_ENABLED(CONFIG_PCI_P2PDMA) && + return IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } diff --git a/include/linux/mm.h b/include/linux/mm.h index cb8bee88e70c..0201d258c646 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1090,7 +1090,7 @@ static inline bool is_zone_movable_page(const struct page *page) return page_zonenum(page) == ZONE_MOVABLE; } -#ifdef CONFIG_DEV_PAGEMAP_OPS +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); bool __put_devmap_managed_page(struct page *page); @@ -1103,12 +1103,12 @@ static inline bool put_devmap_managed_page(struct page *page) return __put_devmap_managed_page(page); } -#else /* CONFIG_DEV_PAGEMAP_OPS */ +#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ static inline bool put_devmap_managed_page(struct page *page) { return false; } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e5fc14ba71f3..cfe632047839 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -566,7 +566,6 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) } dpage->zone_device_data = rpage; - get_page(dpage); lock_page(dpage); return dpage; diff --git a/mm/Kconfig b/mm/Kconfig index 3326ee3903f3..a1901ae6d062 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -776,9 +776,6 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. -config DEV_PAGEMAP_OPS - bool - # # Helpers to mirror range of the CPU page tables of a process into device page # tables. @@ -790,7 +787,6 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ZONE_DEVICE - select DEV_PAGEMAP_OPS help Allows creation of struct pages to represent unaddressable device diff --git a/mm/internal.h b/mm/internal.h index 450a2c8a43f3..3756dd5d2c92 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -735,4 +735,6 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); +void free_zone_device_page(struct page *page); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2c5032294c9f..8fef072dc1ce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5503,17 +5503,12 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to - * a device and because they are not accessible by CPU they are store - * as special swap entry in the CPU page table. + * Handle device private pages that are not accessible by the CPU, but + * stored as special swap entries in the page table. */ if (is_device_private_entry(ent)) { page = pfn_swap_entry_to_page(ent); - /* - * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have - * a refcount of 1 when free (unlike normal page) - */ - if (!page_ref_add_unless(page, 1, 1)) + if (!get_page_unless_zero(page)) return NULL; return page; } diff --git a/mm/memremap.c b/mm/memremap.c index a0ece2344c2c..fef5734d5e49 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -12,6 +12,7 @@ #include #include #include +#include "internal.h" static DEFINE_XARRAY(pgmap_array); @@ -37,21 +38,19 @@ unsigned long memremap_compat_align(void) EXPORT_SYMBOL_GPL(memremap_compat_align); #endif -#ifdef CONFIG_DEV_PAGEMAP_OPS +#ifdef CONFIG_FS_DAX DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_dec(&devmap_managed_key); } static void devmap_managed_enable_get(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_FS_DAX) + if (pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_inc(&devmap_managed_key); } #else @@ -61,7 +60,7 @@ static void devmap_managed_enable_get(struct dev_pagemap *pgmap) static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ static void pgmap_array_delete(struct range *range) { @@ -102,23 +101,12 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) return (range->start + range_len(range)) >> PAGE_SHIFT; } -static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) -{ - if (pfn % (1024 << pgmap->vmemmap_shift)) - cond_resched(); - return pfn + pgmap_vmemmap_nr(pgmap); -} - static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) { return (pfn_end(pgmap, range_id) - pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } -#define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ - pfn = pfn_next(map, pfn)) - static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) { struct range *range = &pgmap->ranges[range_id]; @@ -147,13 +135,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) void memunmap_pages(struct dev_pagemap *pgmap) { - unsigned long pfn; int i; percpu_ref_kill(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) - for_each_device_pfn(pfn, pgmap, i) - put_page(pfn_to_page(pfn)); + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); wait_for_completion(&pgmap->done); percpu_ref_exit(&pgmap->ref); @@ -464,14 +450,10 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -#ifdef CONFIG_DEV_PAGEMAP_OPS -void free_devmap_managed_page(struct page *page) +void free_zone_device_page(struct page *page) { - /* notify page idle for dax */ - if (!is_device_private_page(page)) { - wake_up_var(&page->_refcount); + if (WARN_ON_ONCE(!is_device_private_page(page))) return; - } __ClearPageWaiters(page); @@ -500,28 +482,27 @@ void free_devmap_managed_page(struct page *page) */ page->mapping = NULL; page->pgmap->ops->page_free(page); + + /* + * Reset the page count to 1 to prepare for handing out the page again. + */ + set_page_count(page, 1); } +#ifdef CONFIG_FS_DAX bool __put_devmap_managed_page(struct page *page) { - if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && - page->pgmap->type != MEMORY_DEVICE_FS_DAX) + if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) return false; /* - * devmap page refcounts are 1-based, rather than 0-based: if + * fsdax page refcounts are 1-based, rather than 0-based: if * refcount is 1, then the page is free and the refcount is * stable because nobody holds a reference on the page. */ - switch (page_ref_dec_return(page)) { - case 1: - free_devmap_managed_page(page); - break; - case 0: - __put_page(page); - break; - } + if (page_ref_dec_return(page) == 1) + wake_up_var(&page->_refcount); return true; } EXPORT_SYMBOL(__put_devmap_managed_page); -#endif /* CONFIG_DEV_PAGEMAP_OPS */ +#endif /* CONFIG_FS_DAX */ diff --git a/mm/migrate.c b/mm/migrate.c index e7d0b68d5dcb..af0534de618a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -338,14 +338,8 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) { int expected_count = 1; - /* - * Device private pages have an extra refcount as they are - * ZONE_DEVICE pages. - */ - expected_count += is_device_private_page(page); if (mapping) expected_count += compound_nr(page) + page_has_private(page); - return expected_count; } diff --git a/mm/swap.c b/mm/swap.c index db8d0eea13d7..fc3b7989f5b2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -122,17 +122,9 @@ static void __put_compound_page(struct page *page) void __put_page(struct page *page) { - if (is_zone_device_page(page)) { - put_dev_pagemap(page->pgmap); - - /* - * The page belongs to the device that created pgmap. Do - * not return it to page allocator. - */ - return; - } - - if (unlikely(PageCompound(page))) + if (unlikely(is_zone_device_page(page))) + free_zone_device_page(page); + else if (unlikely(PageCompound(page))) __put_compound_page(page); else __put_single_page(page); @@ -933,7 +925,7 @@ void release_pages(struct page **pages, int nr) if (put_devmap_managed_page(page)) continue; if (put_page_testzero(page)) - put_dev_pagemap(page->pgmap); + free_zone_device_page(page); continue; } -- cgit v1.2.3 From 261b6840ed10419ac2f554e515592d59dd5c82cf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 13 Feb 2022 16:40:24 -0500 Subject: mm: Turn deactivate_file_page() into deactivate_file_folio() This function has one caller which already has a reference to the page, so we don't need to use get_page_unless_zero(). Also move the prototype to mm/internal.h. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin --- include/linux/swap.h | 1 - mm/internal.h | 1 + mm/swap.c | 35 ++++++++++++++++++----------------- mm/truncate.c | 2 +- 4 files changed, 20 insertions(+), 19 deletions(-) (limited to 'mm/swap.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 304f174b4d31..064e60e9f63f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -372,7 +372,6 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); -extern void deactivate_file_page(struct page *page); extern void deactivate_page(struct page *page); extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); diff --git a/mm/internal.h b/mm/internal.h index 9c1959fff477..7c441f43ba31 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -66,6 +66,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); +void deactivate_file_folio(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/swap.c b/mm/swap.c index fc3b7989f5b2..65ec5cbab78b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -630,32 +630,33 @@ void lru_add_drain_cpu(int cpu) } /** - * deactivate_file_page - forcefully deactivate a file page - * @page: page to deactivate + * deactivate_file_folio() - Forcefully deactivate a file folio. + * @folio: Folio to deactivate. * - * This function hints the VM that @page is a good reclaim candidate, - * for example if its invalidation fails due to the page being dirty + * This function hints to the VM that @folio is a good reclaim candidate, + * for example if its invalidation fails due to the folio being dirty * or under writeback. + * + * Context: Caller holds a reference on the page. */ -void deactivate_file_page(struct page *page) +void deactivate_file_folio(struct folio *folio) { + struct pagevec *pvec; + /* - * In a workload with many unevictable page such as mprotect, - * unevictable page deactivation for accelerating reclaim is pointless. + * In a workload with many unevictable pages such as mprotect, + * unevictable folio deactivation for accelerating reclaim is pointless. */ - if (PageUnevictable(page)) + if (folio_test_unevictable(folio)) return; - if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); + folio_get(folio); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); - if (pagevec_add_and_need_flush(pvec, page)) - pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); - local_unlock(&lru_pvecs.lock); - } + if (pagevec_add_and_need_flush(pvec, &folio->page)) + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); + local_unlock(&lru_pvecs.lock); } /* diff --git a/mm/truncate.c b/mm/truncate.c index a8b0243eadf6..9ed62cb3c503 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -527,7 +527,7 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, * of interest and try to speed up its reclaim. */ if (!ret) { - deactivate_file_page(&folio->page); + deactivate_file_folio(folio); /* It is likely on the pagevec of a remote CPU */ if (nr_pagevec) (*nr_pagevec)++; -- cgit v1.2.3