summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
author: Ryan Roberts <ryan.roberts@arm.com>, 2024-04-08 21:39:41 +0300
committer: Andrew Morton <akpm@linux-foundation.org>, 2024-04-26 06:56:37 +0300
commit: a62fb92ac12ed39df4930dca599a3b427552882a (patch)
tree: c0f9ffe693552fb8c89aae858db0186976579e50 /mm
parent: d7d0d389ff90644546ffcb8e15ea3ccaf6138958 (diff)
download: linux-a62fb92ac12ed39df4930dca599a3b427552882a.tar.xz
mm: swap: free_swap_and_cache_nr() as batched free_swap_and_cache()
Now that we no longer have a convenient flag in the cluster to determine if a folio is large, free_swap_and_cache() will take a reference and lock a large folio much more often, which could lead to contention and (e.g.) failure to split large folios, etc. Let's solve that problem by batch freeing swap and cache with a new function, free_swap_and_cache_nr(), to free a contiguous range of swap entries together. This allows us to first drop a reference to each swap slot before we try to release the cache folio. This means we only try to release the folio once, only taking the reference and lock once - much better than the previous 512 times for the 2M THP case. Contiguous swap entries are gathered in zap_pte_range() and madvise_free_pte_range() in a similar way to how present ptes are already gathered in zap_pte_range(). While we are at it, let's simplify by converting the return type of both functions to void. The return value was used only by zap_pte_range() to print a bad pte, and was ignored by everyone else, so the extra reporting wasn't exactly guaranteed. We will still get the warning with most of the information from get_swap_device(). With the batch version, we wouldn't know which pte was bad anyway so could print the wrong one. 
[ryan.roberts@arm.com: fix a build warning on parisc]
Link: https://lkml.kernel.org/r/20240409111840.3173122-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20240408183946.2991168-3-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Gao Xiang <xiang@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- mm/internal.h 64
-rw-r--r-- mm/madvise.c 12
-rw-r--r-- mm/memory.c 13
-rw-r--r-- mm/swapfile.c 97
4 files changed, 158 insertions, 28 deletions
diff --git a/mm/internal.h b/mm/internal.h
index d567381b12cc..d34df04b11f6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,8 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/tracepoint-defs.h>
struct folio_batch;
@@ -189,6 +191,68 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
return min(ptep - start_ptep, max_nr);
}
+
+/**
+ * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
+ * @pte: The initial pte state; is_swap_pte(pte) must be true and
+ * non_swap_entry() must be false.
+ *
+ * Increments the swap offset, while maintaining all other fields, including
+ * swap type, and any swp pte bits. The resulting pte is returned.
+ *
+ * Return: a pte built from the same swap type as @pte and its offset plus one,
+ * with the soft-dirty, exclusive and uffd-wp swp pte bits set if and only if
+ * they were set in @pte.
+ */
+static inline pte_t pte_next_swp_offset(pte_t pte)
+{
+ swp_entry_t entry = pte_to_swp_entry(pte);
+ /* Rebuild the pte from (type, offset + 1); the swp pte bits are re-applied below. */
+ pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
+ (swp_offset(entry) + 1)));
+
+ /* Carry over each swp pte bit that was set in the original pte. */
+ if (pte_swp_soft_dirty(pte))
+ new = pte_swp_mksoft_dirty(new);
+ if (pte_swp_exclusive(pte))
+ new = pte_swp_mkexclusive(new);
+ if (pte_swp_uffd_wp(pte))
+ new = pte_swp_mkuffd_wp(new);
+
+ return new;
+}
+
+/**
+ * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
+ * @start_ptep: Page table pointer for the first entry.
+ * @max_nr: The maximum number of table entries to consider.
+ * @pte: Page table entry for the first entry.
+ *
+ * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
+ * containing swap entries all with consecutive offsets and targeting the same
+ * swap type, all with matching swp pte bits.
+ *
+ * max_nr must be at least one and must be limited by the caller so scanning
+ * cannot exceed a single page table.
+ *
+ * The first entry is taken from @pte rather than re-read from @start_ptep, so
+ * the result is never smaller than 1.
+ *
+ * Return: the number of table entries in the batch.
+ */
+static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
+{
+ /* The pte that the second slot must hold for the batch to continue. */
+ pte_t expected_pte = pte_next_swp_offset(pte);
+ const pte_t *end_ptep = start_ptep + max_nr;
+ /* Slot 0 is described by @pte, so scanning starts at slot 1. */
+ pte_t *ptep = start_ptep + 1;
+
+ VM_WARN_ON(max_nr < 1);
+ VM_WARN_ON(!is_swap_pte(pte));
+ VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
+
+ while (ptep < end_ptep) {
+ pte = ptep_get(ptep);
+
+ /* Whole-pte comparison: any difference from the expected pte ends the batch. */
+ if (!pte_same(pte, expected_pte))
+ break;
+
+ expected_pte = pte_next_swp_offset(expected_pte);
+ ptep++;
+ }
+
+ /* Number of slots accepted, counting the first one. */
+ return ptep - start_ptep;
+}
#endif /* CONFIG_MMU */
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
diff --git a/mm/madvise.c b/mm/madvise.c
index 1f77a51baaac..5011ecb24344 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -628,6 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct folio *folio;
int nr_swap = 0;
unsigned long next;
+ int nr, max_nr;
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
@@ -640,7 +641,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
@@ -655,9 +657,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry)) {
- nr_swap--;
- free_swap_and_cache(entry);
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ nr_swap -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
} else if (is_hwpoison_entry(entry) ||
is_poisoned_swp_entry(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
diff --git a/mm/memory.c b/mm/memory.c
index 694e18837cd8..7880400370c8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1637,12 +1637,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
folio_remove_rmap_pte(folio, page, vma);
folio_put(folio);
} else if (!non_swap_entry(entry)) {
- /* Genuine swap entry, hence a private anon page */
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ /* Genuine swap entries, hence private anon pages */
if (!should_zap_cows(details))
continue;
- rss[MM_SWAPENTS]--;
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
+ rss[MM_SWAPENTS] -= nr;
+ free_swap_and_cache_nr(entry, nr);
} else if (is_migration_entry(entry)) {
folio = pfn_swap_entry_folio(entry);
if (!should_zap_folio(details, folio))
@@ -1665,8 +1666,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
WARN_ON_ONCE(1);
}
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
add_mm_rss_vec(mm, rss);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1ded6d1dcab4..20c45757f2b2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -130,7 +130,11 @@ static inline unsigned char swap_count(unsigned char ent)
/* Reclaim the swap entry if swap is getting full*/
#define TTRS_FULL 0x4
-/* returns 1 if swap entry is freed */
+/*
+ * returns number of pages in the folio that backs the swap entry. If positive,
+ * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
+ * folio was associated with the swap entry.
+ */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
@@ -155,6 +159,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
ret = folio_free_swap(folio);
folio_unlock(folio);
}
+ ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
folio_put(folio);
return ret;
}
@@ -895,7 +900,7 @@ checks:
swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
- if (swap_was_freed)
+ if (swap_was_freed > 0)
goto checks;
goto scan; /* check next one */
}
@@ -1572,32 +1577,88 @@ bool folio_free_swap(struct folio *folio)
return true;
}
-/*
- * Free the swap entry like above, but also try to
- * free the page cache entry if it is the last user.
+/**
+ * free_swap_and_cache_nr() - Release reference on range of swap entries and
+ * reclaim their cache if no more references remain.
+ * @entry: First entry of range.
+ * @nr: Number of entries in range.
+ *
+ * For each swap entry in the contiguous range, release a reference. If any swap
+ * entries become free, try to reclaim their underlying folios, if present. The
+ * offset range is defined by [entry.offset, entry.offset + nr).
+ *
+ * Does nothing if @entry is a non-swap entry or if get_swap_device() fails.
+ * Warns and frees nothing if the range extends past si->max. An entry whose
+ * swap_map slot is already zero triggers a one-time warning and is skipped.
+ */
-int free_swap_and_cache(swp_entry_t entry)
+void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
- struct swap_info_struct *p;
+ const unsigned long start_offset = swp_offset(entry);
+ const unsigned long end_offset = start_offset + nr;
+ unsigned int type = swp_type(entry);
+ struct swap_info_struct *si;
+ bool any_only_cache = false;
+ unsigned long offset;
unsigned char count;
if (non_swap_entry(entry))
- return 1;
+ return;
- p = get_swap_device(entry);
- if (p) {
- if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) {
- put_swap_device(p);
- return 0;
+ si = get_swap_device(entry);
+ if (!si)
+ return;
+
+ /* Warn and bail out if the requested range runs beyond si->max. */
+ if (WARN_ON(end_offset > si->max))
+ goto out;
+
+ /*
+ * First free all entries in the range.
+ */
+ for (offset = start_offset; offset < end_offset; offset++) {
+ if (data_race(si->swap_map[offset])) {
+ count = __swap_entry_free(si, swp_entry(type, offset));
+ if (count == SWAP_HAS_CACHE)
+ any_only_cache = true;
+ } else {
+ WARN_ON_ONCE(1);
}
+ }
+
+ /*
+ * Short-circuit the below loop if none of the entries had their
+ * reference drop to zero.
+ */
+ if (!any_only_cache)
+ goto out;
- count = __swap_entry_free(p, entry);
- if (count == SWAP_HAS_CACHE)
- __try_to_reclaim_swap(p, swp_offset(entry),
+ /*
+ * Now go back over the range trying to reclaim the swap cache. This is
+ * more efficient for large folios because we will only try to reclaim
+ * the swap once per folio in the common case. If we do
+ * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
+ * latter will get a reference and lock the folio for every individual
+ * page but will only succeed once the swap slot for every subpage is
+ * zero.
+ */
+ for (offset = start_offset; offset < end_offset; offset += nr) {
+ /* @nr is reused from here on as the loop stride; default to one slot. */
+ nr = 1;
+ if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+ /*
+ * Folios are always naturally aligned in swap so
+ * advance forward to the next boundary. Zero means no
+ * folio was found for the swap entry, so advance by 1
+ * in this case. Negative value means folio was found
+ * but could not be reclaimed. Here we can still advance
+ * to the next boundary.
+ */
+ nr = __try_to_reclaim_swap(si, offset,
TTRS_UNMAPPED | TTRS_FULL);
- put_swap_device(p);
+ if (nr == 0)
+ nr = 1;
+ else if (nr < 0)
+ nr = -nr;
+ /* Stride = distance from offset to the next nr-aligned offset above it. */
+ nr = ALIGN(offset + 1, nr) - offset;
+ }
}
- return p != NULL;
+
+out:
+ put_swap_device(si);
}
#ifdef CONFIG_HIBERNATION