summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/rmap.h40
-rw-r--r--include/linux/swap.h15
-rw-r--r--include/linux/swapops.h25
-rw-r--r--mm/huge_memory.c78
-rw-r--r--mm/hugetlb.c15
-rw-r--r--mm/ksm.c13
-rw-r--r--mm/memory.c33
-rw-r--r--mm/migrate.c14
-rw-r--r--mm/migrate_device.c21
-rw-r--r--mm/mprotect.c8
-rw-r--r--mm/rmap.c61
11 files changed, 289 insertions, 34 deletions
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e4156921eea9..cbe279a6f0de 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -229,6 +229,13 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
VM_BUG_ON_PAGE(!PageAnon(page), page);
/*
+ * No need to check+clear for already shared pages, including KSM
+ * pages.
+ */
+ if (!PageAnonExclusive(page))
+ goto dup;
+
+ /*
* If this page may have been pinned by the parent process,
* don't allow to duplicate the mapping but instead require to e.g.,
* copy the page immediately for the child so that we'll always
@@ -239,14 +246,47 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
unlikely(page_needs_cow_for_dma(vma, page))))
return -EBUSY;
+ ClearPageAnonExclusive(page);
/*
* It's okay to share the anon page between both processes, mapping
* the page R/O into both processes.
*/
+dup:
__page_dup_rmap(page, compound);
return 0;
}
+/**
+ * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
+ * shared to prepare for KSM or temporary unmapping
+ * @page: the exclusive anonymous page to try marking possibly shared
+ *
+ * The caller needs to hold the PT lock and has to have the page table entry
+ * cleared/invalidated+flushed, to properly sync against GUP-fast.
+ *
+ * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
+ * to duplicate a mapping, but instead to prepare for KSM or temporarily
+ * unmapping a page (swap, migration) via page_remove_rmap().
+ *
+ * Marking the page shared can only fail if the page may be pinned; device
+ * private pages cannot get pinned and consequently this function cannot fail.
+ *
+ * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY
+ * otherwise.
+ */
+static inline int page_try_share_anon_rmap(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
+
+ /* See page_try_dup_anon_rmap(). */
+ if (likely(!is_device_private_page(page) &&
+ unlikely(page_maybe_dma_pinned(page))))
+ return -EBUSY;
+
+ ClearPageAnonExclusive(page);
+ return 0;
+}
+
/*
* Called from mm/vmscan.c to handle paging out
*/
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 27093b477c5f..e6d70a4156e8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -78,12 +78,19 @@ static inline int current_is_kswapd(void)
#endif
/*
- * NUMA node memory migration support
+ * Page migration support.
+ *
+ * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and
+ * indicates that the referenced (part of) an anonymous page is exclusive to
+ * a single process. For SWP_MIGRATION_WRITE, that information is implicit:
+ * (part of) an anonymous page that are mapped writable are exclusive to a
+ * single process.
*/
#ifdef CONFIG_MIGRATION
-#define SWP_MIGRATION_NUM 2
-#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM)
-#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
+#define SWP_MIGRATION_NUM 3
+#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM)
+#define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
+#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2)
#else
#define SWP_MIGRATION_NUM 0
#endif
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 5af852b68805..6648b97244e7 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -194,6 +194,7 @@ static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
static inline int is_migration_entry(swp_entry_t entry)
{
return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
+ swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE ||
swp_type(entry) == SWP_MIGRATION_WRITE);
}
@@ -202,11 +203,26 @@ static inline int is_writable_migration_entry(swp_entry_t entry)
return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
}
+static inline int is_readable_migration_entry(swp_entry_t entry)
+{
+ return unlikely(swp_type(entry) == SWP_MIGRATION_READ);
+}
+
+static inline int is_readable_exclusive_migration_entry(swp_entry_t entry)
+{
+ return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE);
+}
+
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
return swp_entry(SWP_MIGRATION_READ, offset);
}
+static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset)
+{
+ return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset);
+}
+
static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
{
return swp_entry(SWP_MIGRATION_WRITE, offset);
@@ -224,6 +240,11 @@ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
return swp_entry(0, 0);
}
+static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset)
+{
+ return swp_entry(0, 0);
+}
+
static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
{
return swp_entry(0, 0);
@@ -244,6 +265,10 @@ static inline int is_writable_migration_entry(swp_entry_t entry)
{
return 0;
}
+static inline int is_readable_migration_entry(swp_entry_t entry)
+{
+ return 0;
+}
#endif
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cb5bcd833d9e..231911b7bf9d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1054,7 +1054,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
swp_entry_t entry = pmd_to_swp_entry(pmd);
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (is_writable_migration_entry(entry)) {
+ if (!is_readable_migration_entry(entry)) {
entry = make_readable_migration_entry(
swp_offset(entry));
pmd = swp_entry_to_pmd(entry);
@@ -1292,6 +1292,10 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
+ /* Early check when only holding the PT lock. */
+ if (PageAnonExclusive(page))
+ goto reuse;
+
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
@@ -1306,6 +1310,12 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
put_page(page);
}
+ /* Recheck after temporarily dropping the PT lock. */
+ if (PageAnonExclusive(page)) {
+ unlock_page(page);
+ goto reuse;
+ }
+
/*
* See do_wp_page(): we can only map the page writable if there are
* no additional references. Note that we always drain the LRU
@@ -1319,11 +1329,12 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
pmd_t entry;
page_move_anon_rmap(page, vma);
+ unlock_page(page);
+reuse:
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- unlock_page(page);
spin_unlock(vmf->ptl);
return VM_FAULT_WRITE;
}
@@ -1708,6 +1719,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
+ struct page *page = pfn_swap_entry_to_page(entry);
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
if (is_writable_migration_entry(entry)) {
@@ -1716,8 +1728,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* A protection check is difficult so
* just be safe and disable write
*/
- entry = make_readable_migration_entry(
- swp_offset(entry));
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
@@ -1937,6 +1951,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
+ bool anon_exclusive = false;
unsigned long addr;
int i;
@@ -2018,6 +2033,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry);
+ if (PageAnon(page))
+ anon_exclusive = is_readable_exclusive_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
@@ -2029,8 +2046,26 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
uffd_wp = pmd_uffd_wp(old_pmd);
+
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
+
+ /*
+ * Without "freeze", we'll simply split the PMD, propagating the
+ * PageAnonExclusive() flag for each PTE by setting it for
+ * each subpage -- no need to (temporarily) clear.
+ *
+ * With "freeze" we want to replace mapped pages by
+ * migration entries right away. This is only possible if we
+ * managed to clear PageAnonExclusive() -- see
+ * set_pmd_migration_entry().
+ *
+ * In case we cannot clear PageAnonExclusive(), split the PMD
+ * only and let try_to_migrate_one() fail later.
+ */
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
+ freeze = false;
}
/*
@@ -2052,6 +2087,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (write)
swp_entry = make_writable_migration_entry(
page_to_pfn(page + i));
+ else if (anon_exclusive)
+ swp_entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page + i));
else
swp_entry = make_readable_migration_entry(
page_to_pfn(page + i));
@@ -2063,6 +2101,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
entry = maybe_mkwrite(entry, vma);
+ if (anon_exclusive)
+ SetPageAnonExclusive(page + i);
if (!write)
entry = pte_wrprotect(entry);
if (!young)
@@ -2294,6 +2334,13 @@ static void __split_huge_page_tail(struct page *head, int tail,
*
* After successful get_page_unless_zero() might follow flags change,
* for example lock_page() which set PG_waiters.
+ *
+ * Note that for mapped sub-pages of an anonymous THP,
+ * PG_anon_exclusive has been cleared in unmap_page() and is stored in
+ * the migration entry instead from where remap_page() will restore it.
+ * We can still have PG_anon_exclusive set on effectively unmapped and
+ * unreferenced sub-pages of an anonymous THP: we can simply drop
+ * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
*/
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -3025,6 +3072,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
struct vm_area_struct *vma = pvmw->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long address = pvmw->address;
+ bool anon_exclusive;
pmd_t pmdval;
swp_entry_t entry;
pmd_t pmdswp;
@@ -3034,10 +3082,19 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (anon_exclusive && page_try_share_anon_rmap(page)) {
+ set_pmd_at(mm, address, pvmw->pmd, pmdval);
+ return;
+ }
+
if (pmd_dirty(pmdval))
set_page_dirty(page);
if (pmd_write(pmdval))
entry = make_writable_migration_entry(page_to_pfn(page));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
else
entry = make_readable_migration_entry(page_to_pfn(page));
pmdswp = swp_entry_to_pmd(entry);
@@ -3071,10 +3128,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
- if (PageAnon(new))
- page_add_anon_rmap(new, vma, mmun_start, RMAP_COMPOUND);
- else
+ if (PageAnon(new)) {
+ rmap_t rmap_flags = RMAP_COMPOUND;
+
+ if (!is_readable_migration_entry(entry))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ page_add_anon_rmap(new, vma, mmun_start, rmap_flags);
+ } else {
page_add_file_rmap(new, vma, true);
+ }
+ VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
/* No need to invalidate - it was non-present before */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 03cbb75bcb54..c0012b4a8eec 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4790,7 +4790,7 @@ again:
is_hugetlb_entry_hwpoisoned(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
- if (is_writable_migration_entry(swp_entry) && cow) {
+ if (!is_readable_migration_entry(swp_entry) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
@@ -5190,6 +5190,8 @@ retry_avoidcopy:
set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
+ VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
+ old_page);
/*
* If the process that created a MAP_PRIVATE mapping is about to
@@ -6187,12 +6189,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
+ struct page *page = pfn_swap_entry_to_page(entry);
- if (is_writable_migration_entry(entry)) {
+ if (!is_readable_migration_entry(entry)) {
pte_t newpte;
- entry = make_readable_migration_entry(
- swp_offset(entry));
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(
+ swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
set_huge_swap_pte_at(mm, address, ptep,
newpte, huge_page_size(h));
diff --git a/mm/ksm.c b/mm/ksm.c
index 2b6692a7df6a..38360285497a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -872,6 +872,7 @@ static inline struct stable_node *page_stable_node(struct page *page)
static inline void set_page_stable_node(struct page *page,
struct stable_node *stable_node)
{
+ VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}
@@ -1044,6 +1045,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
int swapped;
int err = -EFAULT;
struct mmu_notifier_range range;
+ bool anon_exclusive;
pvmw.address = page_address_in_vma(page, vma);
if (pvmw.address == -EFAULT)
@@ -1061,9 +1063,10 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
goto out_unlock;
+ anon_exclusive = PageAnonExclusive(page);
if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
(pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
- mm_tlb_flush_pending(mm)) {
+ anon_exclusive || mm_tlb_flush_pending(mm)) {
pte_t entry;
swapped = PageSwapCache(page);
@@ -1091,6 +1094,12 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
}
+
+ if (anon_exclusive && page_try_share_anon_rmap(page)) {
+ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
+ goto out_unlock;
+ }
+
if (pte_dirty(entry))
set_page_dirty(page);
@@ -1149,6 +1158,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pte_unmap_unlock(ptep, ptl);
goto out_mn;
}
+ VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
+ VM_BUG_ON_PAGE(PageAnon(kpage) && PageAnonExclusive(kpage), kpage);
/*
* No need to check ksm_use_zero_pages here: we can only have a
diff --git a/mm/memory.c b/mm/memory.c
index 0b0727758c86..454ecc05ad85 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -720,6 +720,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
else if (is_writable_device_exclusive_entry(entry))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page)));
+
/*
* No need to take a page reference as one was already
* created when the swap entry was made.
@@ -796,11 +798,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
rss[mm_counter(page)]++;
- if (is_writable_migration_entry(entry) &&
+ if (!is_readable_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
- * COW mappings require pages in both
- * parent and child to be set to read.
+ * COW mappings require pages in both parent and child
+ * to be set to read. A previously exclusive entry is
+ * now shared.
*/
entry = make_readable_migration_entry(
swp_offset(entry));
@@ -951,6 +954,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
+ VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page));
/*
* If it's a shared mapping, mark it clean in
@@ -2949,6 +2953,9 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
pte_t entry;
+
+ VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page));
+
/*
* Clear the pages cpupid information as the existing
* information potentially belongs to a now completely
@@ -3274,6 +3281,13 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
struct page *page = vmf->page;
/*
+ * If the page is exclusive to this process we must reuse the
+ * page without further checks.
+ */
+ if (PageAnonExclusive(page))
+ goto reuse;
+
+ /*
* We have to verify under page lock: these early checks are
* just an optimization to avoid locking the page and freeing
* the swapcache if there is little hope that we can reuse.
@@ -3305,6 +3319,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
*/
page_move_anon_rmap(page, vma);
unlock_page(page);
+reuse:
wp_page_reuse(vmf);
return VM_FAULT_WRITE;
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
@@ -3696,11 +3711,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* that are certainly not shared because we just allocated them without
* exposing them to the swapcache.
*/
- if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
- (page != swapcache || page_count(page) == 1)) {
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- vmf->flags &= ~FAULT_FLAG_WRITE;
- ret |= VM_FAULT_WRITE;
+ if (!PageKsm(page) && (page != swapcache || page_count(page) == 1)) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ vmf->flags &= ~FAULT_FLAG_WRITE;
+ ret |= VM_FAULT_WRITE;
+ }
rmap_flags |= RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
@@ -3720,6 +3736,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
}
+ VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page)));
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
diff --git a/mm/migrate.c b/mm/migrate.c
index 92e932f08be5..b2678279eb43 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -177,6 +177,7 @@ static bool remove_migration_pte(struct folio *folio,
DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
while (page_vma_mapped_walk(&pvmw)) {
+ rmap_t rmap_flags = RMAP_NONE;
pte_t pte;
swp_entry_t entry;
struct page *new;
@@ -211,6 +212,9 @@ static bool remove_migration_pte(struct folio *folio,
else if (pte_swp_uffd_wp(*pvmw.pte))
pte = pte_mkuffd_wp(pte);
+ if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
if (unlikely(is_device_private_page(new))) {
if (pte_write(pte))
entry = make_writable_device_private_entry(
@@ -233,7 +237,7 @@ static bool remove_migration_pte(struct folio *folio,
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
if (folio_test_anon(folio))
hugepage_add_anon_rmap(new, vma, pvmw.address,
- RMAP_NONE);
+ rmap_flags);
else
page_dup_file_rmap(new, true);
set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
@@ -242,7 +246,7 @@ static bool remove_migration_pte(struct folio *folio,
{
if (folio_test_anon(folio))
page_add_anon_rmap(new, vma, pvmw.address,
- RMAP_NONE);
+ rmap_flags);
else
page_add_file_rmap(new, vma, false);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
@@ -514,6 +518,12 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_set_workingset(newfolio);
if (folio_test_checked(folio))
folio_set_checked(newfolio);
+ /*
+ * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
+ * migration entries. We can still have PG_anon_exclusive set on an
+ * effectively unmapped and unreferenced first sub-pages of an
+ * anonymous THP: we can simply copy it here via PG_mappedtodisk.
+ */
if (folio_test_mappedtodisk(folio))
folio_set_mappedtodisk(newfolio);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index fb6d7d5499f5..5052093d0262 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -184,15 +184,34 @@ again:
* set up a special migration page table entry now.
*/
if (trylock_page(page)) {
+ bool anon_exclusive;
pte_t swp_pte;
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (anon_exclusive) {
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush(vma, addr, ptep);
+
+ if (page_try_share_anon_rmap(page)) {
+ set_pte_at(mm, addr, ptep, pte);
+ unlock_page(page);
+ put_page(page);
+ mpfn = 0;
+ goto next;
+ }
+ } else {
+ ptep_get_and_clear(mm, addr, ptep);
+ }
+
migrate->cpages++;
- ptep_get_and_clear(mm, addr, ptep);
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
page_to_pfn(page));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page));
else
entry = make_readable_migration_entry(
page_to_pfn(page));
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b69ce7a7b2b7..56060acdabd3 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -152,6 +152,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pages++;
} else if (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
+ struct page *page = pfn_swap_entry_to_page(entry);
pte_t newpte;
if (is_writable_migration_entry(entry)) {
@@ -159,8 +160,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* A protection check is difficult so
* just be safe and disable write
*/
- entry = make_readable_migration_entry(
- swp_offset(entry));
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(
+ swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
diff --git a/mm/rmap.c b/mm/rmap.c
index 90f92c53476f..0d63e7ce35cc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1088,6 +1088,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
+ struct page *subpage = page;
page = compound_head(page);
@@ -1101,6 +1102,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
* folio_test_anon()) will not see one without the other.
*/
WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
+ SetPageAnonExclusive(subpage);
}
/**
@@ -1118,7 +1120,7 @@ static void __page_set_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
if (PageAnon(page))
- return;
+ goto out;
/*
* If the page isn't exclusively mapped into this vma,
@@ -1137,6 +1139,9 @@ static void __page_set_anon_rmap(struct page *page,
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
page->index = linear_page_index(vma, address);
+out:
+ if (exclusive)
+ SetPageAnonExclusive(page);
}
/**
@@ -1198,6 +1203,8 @@ void page_add_anon_rmap(struct page *page,
} else {
first = atomic_inc_and_test(&page->_mapcount);
}
+ VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+ VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
if (first) {
int nr = compound ? thp_nr_pages(page) : 1;
@@ -1459,7 +1466,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
pte_t pteval;
struct page *subpage;
- bool ret = true;
+ bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
@@ -1515,6 +1522,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
subpage = folio_page(folio,
pte_pfn(*pvmw.pte) - folio_pfn(folio));
address = pvmw.address;
+ anon_exclusive = folio_test_anon(folio) &&
+ PageAnonExclusive(subpage);
if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
@@ -1550,9 +1559,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
}
}
- /* Nuke the page table entry. */
+ /*
+ * Nuke the page table entry. When having to clear
+ * PageAnonExclusive(), we always have to flush.
+ */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- if (should_defer_flush(mm, flags)) {
+ if (should_defer_flush(mm, flags) && !anon_exclusive) {
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio.
@@ -1677,6 +1689,24 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
+ if (anon_exclusive &&
+ page_try_share_anon_rmap(subpage)) {
+ swap_free(entry);
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ /*
+ * Note: We *don't* remember yet if the page was mapped
+ * exclusively in the swap entry, so swapin code has
+ * to re-determine that manually and might detect the
+ * page as possibly shared, for example, if there are
+ * other references on the page or if the page is under
+ * writeback. We made sure that there are no GUP pins
+ * on the page that would rely on it, so for GUP pins
+ * this is fine.
+ */
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
if (list_empty(&mm->mmlist))
@@ -1776,7 +1806,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
pte_t pteval;
struct page *subpage;
- bool ret = true;
+ bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
@@ -1837,6 +1867,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
subpage = folio_page(folio,
pte_pfn(*pvmw.pte) - folio_pfn(folio));
address = pvmw.address;
+ anon_exclusive = folio_test_anon(folio) &&
+ PageAnonExclusive(subpage);
if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
@@ -1888,6 +1920,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t swp_pte;
+ if (anon_exclusive)
+ BUG_ON(page_try_share_anon_rmap(subpage));
+
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
@@ -1896,6 +1931,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
entry = pte_to_swp_entry(pteval);
if (is_writable_device_private_entry(entry))
entry = make_writable_migration_entry(pfn);
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(pfn);
else
entry = make_readable_migration_entry(pfn);
swp_pte = swp_entry_to_pte(entry);
@@ -1960,6 +1997,15 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
+ VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
+ !anon_exclusive, subpage);
+ if (anon_exclusive &&
+ page_try_share_anon_rmap(subpage)) {
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
/*
* Store the pfn of the page in a special migration
@@ -1969,6 +2015,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_write(pteval))
entry = make_writable_migration_entry(
page_to_pfn(subpage));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(subpage));
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
@@ -2405,6 +2454,8 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
BUG_ON(!anon_vma);
/* address might be in next vma when migration races vma_adjust */
first = atomic_inc_and_test(compound_mapcount_ptr(page));
+ VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+ VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
if (first)
__page_set_anon_rmap(page, vma, address,
!!(flags & RMAP_EXCLUSIVE));