summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-12-03 00:39:38 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2022-12-03 00:39:38 +0300
commitbdaa78c6aa861f0e8c612a0b2272423d92f0071c (patch)
tree3a4c4e15fb169955c8c07d20bdd8cef84ef0f35a /mm
parent6647e76ab623b2b3fb2efe03a86e9c9046c52c33 (diff)
parent1d351f1894342c378b96bb9ed89f8debb1e24e9f (diff)
downloadlinux-bdaa78c6aa861f0e8c612a0b2272423d92f0071c.tar.xz
Merge tag 'mm-hotfixes-stable-2022-12-02' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull misc hotfixes from Andrew Morton: "15 hotfixes, 11 marked cc:stable. Only three or four of the latter address post-6.0 issues, which is hopefully a sign that things are converging" * tag 'mm-hotfixes-stable-2022-12-02' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: revert "kbuild: fix -Wimplicit-function-declaration in license_is_gpl_compatible" Kconfig.debug: provide a little extra FRAME_WARN leeway when KASAN is enabled drm/amdgpu: temporarily disable broken Clang builds due to blown stack-frame mm/khugepaged: invoke MMU notifiers in shmem/file collapse paths mm/khugepaged: fix GUP-fast interaction by sending IPI mm/khugepaged: take the right locks for page table retraction mm: migrate: fix THP's mapcount on isolation mm: introduce arch_has_hw_nonleaf_pmd_young() mm: add dummy pmd_young() for architectures not having it mm/damon/sysfs: fix wrong empty schemes assumption under online tuning in damon_sysfs_set_schemes() tools/vm/slabinfo-gnuplot: use "grep -E" instead of "egrep" nilfs2: fix NULL pointer dereference in nilfs_palloc_commit_free_entry() hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing madvise: use zap_page_range_single for madvise dontneed mm: replace VM_WARN_ON to pr_warn if the node is offline with __GFP_THISNODE
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c22
-rw-r--r--mm/damon/sysfs.c46
-rw-r--r--mm/hugetlb.c27
-rw-r--r--mm/khugepaged.c62
-rw-r--r--mm/madvise.c6
-rw-r--r--mm/memory.c25
-rw-r--r--mm/mmu_gather.c4
-rw-r--r--mm/vmscan.c10
8 files changed, 150 insertions, 52 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index c51f7f545afe..1f6da31dd9a5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -985,28 +985,28 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
/*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ if (unlikely(!get_page_unless_zero(page)))
+ goto isolate_fail;
+
+ /*
* Migration will fail if an anonymous page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
mapping = page_mapping(page);
- if (!mapping && page_count(page) > page_mapcount(page))
- goto isolate_fail;
+ if (!mapping && (page_count(page) - 1) > total_mapcount(page))
+ goto isolate_fail_put;
/*
* Only allow to migrate anonymous pages in GFP_NOFS context
* because those do not depend on fs locks.
*/
if (!(cc->gfp_mask & __GFP_FS) && mapping)
- goto isolate_fail;
-
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- if (unlikely(!get_page_unless_zero(page)))
- goto isolate_fail;
+ goto isolate_fail_put;
/* Only take pages on LRU: a check now makes later tests safe */
if (!PageLRU(page))
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 5ce403378c20..07e5f1bdf025 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -2283,12 +2283,54 @@ static struct damos *damon_sysfs_mk_scheme(
&wmarks);
}
+static void damon_sysfs_update_scheme(struct damos *scheme,
+ struct damon_sysfs_scheme *sysfs_scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ sysfs_scheme->access_pattern;
+ struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+ struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+ struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+
+ scheme->pattern.min_sz_region = access_pattern->sz->min;
+ scheme->pattern.max_sz_region = access_pattern->sz->max;
+ scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min;
+ scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max;
+ scheme->pattern.min_age_region = access_pattern->age->min;
+ scheme->pattern.max_age_region = access_pattern->age->max;
+
+ scheme->action = sysfs_scheme->action;
+
+ scheme->quota.ms = sysfs_quotas->ms;
+ scheme->quota.sz = sysfs_quotas->sz;
+ scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms;
+ scheme->quota.weight_sz = sysfs_weights->sz;
+ scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
+ scheme->quota.weight_age = sysfs_weights->age;
+
+ scheme->wmarks.metric = sysfs_wmarks->metric;
+ scheme->wmarks.interval = sysfs_wmarks->interval_us;
+ scheme->wmarks.high = sysfs_wmarks->high;
+ scheme->wmarks.mid = sysfs_wmarks->mid;
+ scheme->wmarks.low = sysfs_wmarks->low;
+}
+
static int damon_sysfs_set_schemes(struct damon_ctx *ctx,
struct damon_sysfs_schemes *sysfs_schemes)
{
- int i;
+ struct damos *scheme, *next;
+ int i = 0;
+
+ damon_for_each_scheme_safe(scheme, next, ctx) {
+ if (i < sysfs_schemes->nr)
+ damon_sysfs_update_scheme(scheme,
+ sysfs_schemes->schemes_arr[i]);
+ else
+ damon_destroy_scheme(scheme);
+ i++;
+ }
- for (i = 0; i < sysfs_schemes->nr; i++) {
+ for (; i < sysfs_schemes->nr; i++) {
struct damos *scheme, *next;
scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f1385c3b6c96..e36ca75311a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5206,17 +5206,22 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
- /*
- * Unlock and free the vma lock before releasing i_mmap_rwsem. When
- * the vma_lock is freed, this makes the vma ineligible for pmd
- * sharing. And, i_mmap_rwsem is required to set up pmd sharing.
- * This is important as page tables for this unmapped range will
- * be asynchrously deleted. If the page tables are shared, there
- * will be issues when accessed by someone else.
- */
- __hugetlb_vma_unlock_write_free(vma);
-
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
+ /*
+ * Unlock and free the vma lock before releasing i_mmap_rwsem.
+ * When the vma_lock is freed, this makes the vma ineligible
+ * for pmd sharing. And, i_mmap_rwsem is required to set up
+ * pmd sharing. This is important as page tables for this
+ * unmapped range will be asynchrously deleted. If the page
+ * tables are shared, there will be issues when accessed by
+ * someone else.
+ */
+ __hugetlb_vma_unlock_write_free(vma);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ } else {
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
+ }
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a8d5ef2a77d2..3703a56571c1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1051,6 +1051,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
+ tlb_remove_table_sync_one();
spin_lock(pte_ptl);
result = __collapse_huge_page_isolate(vma, address, pte, cc,
@@ -1379,16 +1380,43 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
return SCAN_SUCCEED;
}
+/*
+ * A note about locking:
+ * Trying to take the page table spinlocks would be useless here because those
+ * are only used to synchronize:
+ *
+ * - modifying terminal entries (ones that point to a data page, not to another
+ * page table)
+ * - installing *new* non-terminal entries
+ *
+ * Instead, we need roughly the same kind of protection as free_pgtables() or
+ * mm_take_all_locks() (but only for a single VMA):
+ * The mmap lock together with this VMA's rmap locks covers all paths towards
+ * the page table entries we're messing with here, except for hardware page
+ * table walks and lockless_pages_from_mm().
+ */
static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- spinlock_t *ptl;
pmd_t pmd;
+ struct mmu_notifier_range range;
mmap_assert_write_locked(mm);
- ptl = pmd_lock(vma->vm_mm, pmdp);
+ if (vma->vm_file)
+ lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
+ /*
+ * All anon_vmas attached to the VMA have the same root and are
+ * therefore locked by the same lock.
+ */
+ if (vma->anon_vma)
+ lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr,
+ addr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
pmd = pmdp_collapse_flush(vma, addr, pmdp);
- spin_unlock(ptl);
+ tlb_remove_table_sync_one();
+ mmu_notifier_invalidate_range_end(&range);
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pmd);
pte_free(mm, pmd_pgtable(pmd));
@@ -1439,6 +1467,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
return SCAN_VMA_CHECK;
+ /*
+ * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
+ * that got written to. Without this, we'd have to also lock the
+ * anon_vma if one exists.
+ */
+ if (vma->anon_vma)
+ return SCAN_VMA_CHECK;
+
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
if (userfaultfd_wp(vma))
return SCAN_PTE_UFFD_WP;
@@ -1472,6 +1508,20 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto drop_hpage;
}
+ /*
+ * We need to lock the mapping so that from here on, only GUP-fast and
+ * hardware page walks can access the parts of the page tables that
+ * we're operating on.
+ * See collapse_and_free_pmd().
+ */
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ /*
+ * This spinlock should be unnecessary: Nobody else should be accessing
+ * the page tables under spinlock protection here, only
+ * lockless_pages_from_mm() and the hardware page walker can access page
+ * tables while all the high-level locks are held in write mode.
+ */
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
result = SCAN_FAIL;
@@ -1526,6 +1576,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
/* step 4: remove pte entries */
collapse_and_free_pmd(mm, vma, haddr, pmd);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
@@ -1539,6 +1591,7 @@ drop_hpage:
abort:
pte_unmap_unlock(start_pte, ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
goto drop_hpage;
}
@@ -1595,7 +1648,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
* An alternative would be drop the check, but check that page
* table is clear before calling pmdp_collapse_flush() under
* ptl. It has higher chance to recover THP for the VMA, but
- * has higher cost too.
+ * has higher cost too. It would also probably require locking
+ * the anon_vma.
*/
if (vma->anon_vma) {
result = SCAN_PAGE_ANON;
diff --git a/mm/madvise.c b/mm/madvise.c
index c7105ec6d08c..b913ba6efc10 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -772,8 +772,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for shrink_active_list to actually free
- * these pages later if no one else has touched them in the meantime,
+ * zap_page_range_single call sets things up for shrink_active_list to actually
+ * free these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
* shrink_active_list to pick up before reclaiming other pages.
*
@@ -790,7 +790,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range(vma, start, end - start);
+ zap_page_range_single(vma, start, end - start, NULL);
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index 8a6d5c823f91..8c8420934d60 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1341,15 +1341,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
return ret;
}
-/*
- * Parameter block passed down to zap_pte_range in exceptional cases.
- */
-struct zap_details {
- struct folio *single_folio; /* Locked folio to be unmapped */
- bool even_cows; /* Zap COWed private pages too? */
- zap_flags_t zap_flags; /* Extra flags for zapping */
-};
-
/* Whether we should zap all COWed (private) pages too */
static inline bool should_zap_cows(struct zap_details *details)
{
@@ -1720,7 +1711,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
{
struct mmu_notifier_range range;
struct zap_details details = {
- .zap_flags = ZAP_FLAG_DROP_MARKER,
+ .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
/* Careful - we need to zap private pages too! */
.even_cows = true,
};
@@ -1774,19 +1765,27 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
*
* The range must fit into one VMA.
*/
-static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
+ const unsigned long end = address + size;
struct mmu_notifier_range range;
struct mmu_gather tlb;
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address, address + size);
+ address, end);
+ if (is_vm_hugetlb_page(vma))
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
- unmap_single_vma(&tlb, vma, address, range.end, details);
+ /*
+ * unmap 'address-end' not 'range.start-range.end' as range
+ * could have been expanded for hugetlb pmd sharing.
+ */
+ unmap_single_vma(&tlb, vma, address, end, details);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index add4244e5790..3a2c3f8cad2f 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -153,7 +153,7 @@ static void tlb_remove_table_smp_sync(void *arg)
/* Simply deliver the interrupt */
}
-static void tlb_remove_table_sync_one(void)
+void tlb_remove_table_sync_one(void)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
@@ -177,8 +177,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch)
#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
-static void tlb_remove_table_sync_one(void) { }
-
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
__tlb_remove_table_free(batch);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 026199c047e0..8fcc5fa768c0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3987,7 +3987,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ if (arch_has_hw_nonleaf_pmd_young() &&
get_cap(LRU_GEN_NONLEAF_YOUNG))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
@@ -4085,14 +4085,14 @@ restart:
#endif
walk->mm_stats[MM_NONLEAF_TOTAL]++;
-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (arch_has_hw_nonleaf_pmd_young() &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG)) {
if (!pmd_young(val))
continue;
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
}
-#endif
+
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
continue;
@@ -5392,7 +5392,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
caps |= BIT(LRU_GEN_MM_WALK);
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);