-rw-r--r--  include/linux/pgtable.h  70
-rw-r--r--  mm/memory.c              92
2 files changed, 136 insertions, 26 deletions
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index aab227e12493..49ab1f73b5c2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -580,6 +580,76 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
}
#endif
+#ifndef get_and_clear_full_ptes
+/**
+ * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
+ * the same folio, collecting dirty/accessed bits.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the
+ * returned PTE.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+ pte_t pte, tmp_pte;
+
+ pte = ptep_get_and_clear_full(mm, addr, ptep, full);
+ while (--nr) {
+ ptep++;
+ addr += PAGE_SIZE;
+ tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full);
+ if (pte_dirty(tmp_pte))
+ pte = pte_mkdirty(pte);
+ if (pte_young(tmp_pte))
+ pte = pte_mkyoung(pte);
+ }
+ return pte;
+}
+#endif
+
+#ifndef clear_full_ptes
+/**
+ * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
+ * folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_get_and_clear_full().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ for (;;) {
+ ptep_get_and_clear_full(mm, addr, ptep, full);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+#endif
/*
* If two threads concurrently fault at the same page, the thread that
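
The two helpers above share one contract: the caller has already established
that @nr consecutive PTEs map consecutive pages of the same folio, so the
per-PTE accessed/dirty state can either be folded into a single returned PTE
(get_and_clear_full_ptes) or discarded outright (clear_full_ptes). The
following minimal userspace sketch shows just the folding step; every mock_*
name is invented for illustration, and only the merge logic mirrors the
generic implementation above.

#include <stdbool.h>
#include <stdio.h>

/* Mock PTE: bit 1 = dirty, bit 2 = young; stand-ins for the kernel helpers. */
typedef unsigned long mock_pte_t;

#define MOCK_PTE_DIRTY	(1UL << 1)
#define MOCK_PTE_YOUNG	(1UL << 2)

static bool mock_pte_dirty(mock_pte_t pte) { return pte & MOCK_PTE_DIRTY; }
static bool mock_pte_young(mock_pte_t pte) { return pte & MOCK_PTE_YOUNG; }
static mock_pte_t mock_pte_mkdirty(mock_pte_t pte) { return pte | MOCK_PTE_DIRTY; }
static mock_pte_t mock_pte_mkyoung(mock_pte_t pte) { return pte | MOCK_PTE_YOUNG; }

/* Clear @nr entries, folding dirty/young of later entries into the first. */
static mock_pte_t mock_get_and_clear_ptes(mock_pte_t *ptep, unsigned int nr)
{
	mock_pte_t pte = *ptep, tmp;

	*ptep = 0;
	while (--nr) {
		tmp = *++ptep;
		*ptep = 0;
		if (mock_pte_dirty(tmp))
			pte = mock_pte_mkdirty(pte);
		if (mock_pte_young(tmp))
			pte = mock_pte_mkyoung(pte);
	}
	return pte;
}

int main(void)
{
	/* Four PTEs of one folio: only the third is dirty, only the fourth young. */
	mock_pte_t ptes[4] = { 0, 0, MOCK_PTE_DIRTY, MOCK_PTE_YOUNG };
	mock_pte_t pte = mock_get_and_clear_ptes(ptes, 4);

	printf("dirty=%d young=%d\n", mock_pte_dirty(pte), mock_pte_young(pte));
	return 0;
}

This prints "dirty=1 young=1": one folded PTE is enough for the caller to do a
single folio_mark_dirty()/folio_mark_accessed() per batch instead of one per
page, which is exactly how zap_present_folio_ptes() uses it below.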
diff --git a/mm/memory.c b/mm/memory.c
index 168096f9360e..465ada39c2b7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1517,7 +1517,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
*/
static inline void
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte,
+ unsigned long addr, pte_t *pte, int nr,
struct zap_details *details, pte_t pteval)
{
/* Zap on anonymous always means dropping everything */
@@ -1527,20 +1527,27 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
if (zap_drop_file_uffd_wp(details))
return;
- pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ for (;;) {
+ /* the PFN in the PTE is irrelevant. */
+ pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+ if (--nr == 0)
+ break;
+ pte++;
+ addr += PAGE_SIZE;
+ }
}
-static inline void zap_present_folio_pte(struct mmu_gather *tlb,
+static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, struct folio *folio,
- struct page *page, pte_t *pte, pte_t ptent, unsigned long addr,
- struct zap_details *details, int *rss, bool *force_flush,
- bool *force_break)
+ struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
+ unsigned long addr, struct zap_details *details, int *rss,
+ bool *force_flush, bool *force_break)
{
struct mm_struct *mm = tlb->mm;
bool delay_rmap = false;
if (!folio_test_anon(folio)) {
- ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+ ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
if (pte_dirty(ptent)) {
folio_mark_dirty(folio);
if (tlb_delay_rmap(tlb)) {
@@ -1550,36 +1557,49 @@ static inline void zap_present_folio_pte(struct mmu_gather *tlb,
}
if (pte_young(ptent) && likely(vma_has_recency(vma)))
folio_mark_accessed(folio);
- rss[mm_counter(folio)]--;
+ rss[mm_counter(folio)] -= nr;
} else {
/* We don't need up-to-date accessed/dirty bits. */
- ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
- rss[MM_ANONPAGES]--;
+ clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+ rss[MM_ANONPAGES] -= nr;
}
+ /* Checking a single PTE in a batch is sufficient. */
arch_check_zapped_pte(vma, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
if (unlikely(userfaultfd_pte_wp(vma, ptent)))
- zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
+ ptent);
if (!delay_rmap) {
- folio_remove_rmap_pte(folio, page, vma);
+ folio_remove_rmap_ptes(folio, page, nr, vma);
+
+ /* Only sanity-check the first page in a batch. */
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
}
- if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
+ if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
*force_flush = true;
*force_break = true;
}
}
-static inline void zap_present_pte(struct mmu_gather *tlb,
+/*
+ * Zap or skip at least one present PTE, trying to batch-process subsequent
+ * PTEs that map consecutive pages of the same folio.
+ *
+ * Returns the number of processed (skipped or zapped) PTEs (at least 1).
+ */
+static inline int zap_present_ptes(struct mmu_gather *tlb,
struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
- unsigned long addr, struct zap_details *details,
- int *rss, bool *force_flush, bool *force_break)
+ unsigned int max_nr, unsigned long addr,
+ struct zap_details *details, int *rss, bool *force_flush,
+ bool *force_break)
{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct mm_struct *mm = tlb->mm;
struct folio *folio;
struct page *page;
+ int nr;
page = vm_normal_page(vma, addr, ptent);
if (!page) {
@@ -1589,14 +1609,29 @@ static inline void zap_present_pte(struct mmu_gather *tlb,
tlb_remove_tlb_entry(tlb, pte, addr);
VM_WARN_ON_ONCE(userfaultfd_wp(vma));
ksm_might_unmap_zero_page(mm, ptent);
- return;
+ return 1;
}
folio = page_folio(page);
if (unlikely(!should_zap_folio(details, folio)))
- return;
- zap_present_folio_pte(tlb, vma, folio, page, pte, ptent, addr, details,
- rss, force_flush, force_break);
+ return 1;
+
+ /*
+ * Make sure that the common "small folio" case is as fast as possible
+ * by keeping the batching logic separate.
+ */
+ if (unlikely(folio_test_large(folio) && max_nr != 1)) {
+ nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
+ NULL);
+
+ zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
+ addr, details, rss, force_flush,
+ force_break);
+ return nr;
+ }
+ zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
+ details, rss, force_flush, force_break);
+ return 1;
}
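
zap_present_ptes() only pays the batching cost when the folio is large and
more than one PTE remains in the range; folio_pte_batch() then counts how many
subsequent PTEs map the next pages of the same folio, ignoring dirty and
soft-dirty bits as requested by fpb_flags. The standalone sketch below
illustrates that counting idea under the assumption of a simple
PFN-in-high-bits PTE layout; the mock_* names are invented and do not reflect
folio_pte_batch()'s actual internals.

#include <stdio.h>

typedef unsigned long mock_pte_t;	/* mock PTE: PFN above bit 12, flags below */

#define MOCK_PFN_SHIFT	12
#define MOCK_PTE_DIRTY	(1UL << 1)

static unsigned long mock_pte_pfn(mock_pte_t pte)
{
	return pte >> MOCK_PFN_SHIFT;
}

/*
 * Count how many of the first @max_nr entries at @ptep map consecutive PFNs.
 * Flag bits such as dirty are deliberately ignored, mirroring a
 * folio_pte_batch() call with FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY.
 */
static unsigned int mock_pte_batch(const mock_pte_t *ptep, unsigned int max_nr)
{
	unsigned long pfn = mock_pte_pfn(ptep[0]);
	unsigned int nr = 1;

	while (nr < max_nr && mock_pte_pfn(ptep[nr]) == pfn + nr)
		nr++;
	return nr;
}

int main(void)
{
	/* PFNs 100..102 are consecutive (one entry dirty); PFN 200 breaks the run. */
	mock_pte_t ptes[4] = {
		100UL << MOCK_PFN_SHIFT,
		(101UL << MOCK_PFN_SHIFT) | MOCK_PTE_DIRTY,
		102UL << MOCK_PFN_SHIFT,
		200UL << MOCK_PFN_SHIFT,
	};

	printf("batch length: %u\n", mock_pte_batch(ptes, 4));	/* prints 3 */
	return 0;
}

A batch of nr PTEs lets zap_present_folio_ptes() update the RSS counters, the
rmap and the TLB bookkeeping once per batch rather than once per page.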
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1611,6 +1646,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
+ int nr;
tlb_change_page_size(tlb, PAGE_SIZE);
init_rss_vec(rss);
@@ -1624,7 +1660,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t ptent = ptep_get(pte);
struct folio *folio;
struct page *page;
+ int max_nr;
+ nr = 1;
if (pte_none(ptent))
continue;
@@ -1632,10 +1670,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
break;
if (pte_present(ptent)) {
- zap_present_pte(tlb, vma, pte, ptent, addr, details,
- rss, &force_flush, &force_break);
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
+ addr, details, rss, &force_flush,
+ &force_break);
if (unlikely(force_break)) {
- addr += PAGE_SIZE;
+ addr += nr * PAGE_SIZE;
break;
}
continue;
@@ -1689,8 +1729,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
WARN_ON_ONCE(1);
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+ } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
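
Note how the rewritten loop advances its cursor by whatever batch length
zap_present_ptes() reports, including on the force_break path, where addr must
still step past the already-zapped batch before breaking out. A small
userspace sketch of this variable-stride scan (mock_* names invented; the
1-4 entry batch lengths are arbitrary stand-ins for real folio sizes):

#include <stdio.h>

#define MOCK_PAGE_SIZE	4096UL

/* Pretend each call consumes 1-4 entries, clamped like zap_present_ptes(). */
static unsigned int mock_process(unsigned int idx, unsigned int max_nr)
{
	unsigned int nr = idx % 4 + 1;

	return nr < max_nr ? nr : max_nr;
}

int main(void)
{
	unsigned long addr = 0, end = 16 * MOCK_PAGE_SIZE;
	unsigned int idx = 0;

	do {
		/* Never batch past the end of the range, as in zap_pte_range(). */
		unsigned int max_nr = (end - addr) / MOCK_PAGE_SIZE;
		unsigned int nr = mock_process(idx, max_nr);

		printf("processed %u PTE(s) at %#lx\n", nr, addr);
		idx += nr;
		addr += MOCK_PAGE_SIZE * nr;
	} while (addr != end);
	return 0;
}

Because nr is at least 1 and never exceeds max_nr, the cursor can neither
stall nor overshoot end, which is what makes the "addr != end" termination
condition of the real loop safe.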