summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-28 20:28:11 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-28 20:28:11 +0300
commit6e17c6de3ddf3073741d9c91a796ee696914d8a0 (patch)
tree2c425707f78642625dbe2c824c7fded2021e3dc7 /mm
parent6aeadf7896bff4ca230702daba8788455e6b866e (diff)
parentacc72d59c7509540c27c49625cb4b5a8db1f1a84 (diff)
downloadlinux-6e17c6de3ddf3073741d9c91a796ee696914d8a0.tar.xz
Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull mm updates from Andrew Morton: - Yosry Ahmed brought back some cgroup v1 stats in OOM logs - Yosry has also eliminated cgroup's atomic rstat flushing - Nhat Pham adds the new cachestat() syscall. It provides userspace with the ability to query pagecache status - a similar concept to mincore() but more powerful and with improved usability - Mel Gorman provides more optimizations for compaction, reducing the prevalence of page rescanning - Lorenzo Stoakes has done some maintanance work on the get_user_pages() interface - Liam Howlett continues with cleanups and maintenance work to the maple tree code. Peng Zhang also does some work on maple tree - Johannes Weiner has done some cleanup work on the compaction code - David Hildenbrand has contributed additional selftests for get_user_pages() - Thomas Gleixner has contributed some maintenance and optimization work for the vmalloc code - Baolin Wang has provided some compaction cleanups, - SeongJae Park continues maintenance work on the DAMON code - Huang Ying has done some maintenance on the swap code's usage of device refcounting - Christoph Hellwig has some cleanups for the filemap/directio code - Ryan Roberts provides two patch series which yield some rationalization of the kernel's access to pte entries - use the provided APIs rather than open-coding accesses - Lorenzo Stoakes has some fixes to the interaction between pagecache and directio access to file mappings - John Hubbard has a series of fixes to the MM selftesting code - ZhangPeng continues the folio conversion campaign - Hugh Dickins has been working on the pagetable handling code, mainly with a view to reducing the load on the mmap_lock - Catalin Marinas has reduced the arm64 kmalloc() minimum alignment from 128 to 8 - Domenico Cerasuolo has improved the zswap reclaim mechanism by reorganizing the LRU management - Matthew Wilcox provides some fixups to make gfs2 work better with the buffer_head code - Vishal Moola also has done some folio conversion work - Matthew Wilcox has removed the remnants of the pagevec code - their functionality is migrated over to struct folio_batch * tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits) mm/hugetlb: remove hugetlb_set_page_subpool() mm: nommu: correct the range of mmap_sem_read_lock in task_mem() hugetlb: revert use of page_cache_next_miss() Revert "page cache: fix page_cache_next/prev_miss off by one" mm/vmscan: fix root proactive reclaim unthrottling unbalanced node mm: memcg: rename and document global_reclaim() mm: kill [add|del]_page_to_lru_list() mm: compaction: convert to use a folio in isolate_migratepages_block() mm: zswap: fix double invalidate with exclusive loads mm: remove unnecessary pagevec includes mm: remove references to pagevec mm: rename invalidate_mapping_pagevec to mapping_try_invalidate mm: remove struct pagevec net: convert sunrpc from pagevec to folio_batch i915: convert i915_gpu_error to use a folio_batch pagevec: rename fbatch_count() mm: remove check_move_unevictable_pages() drm: convert drm_gem_put_pages() to use a folio_batch i915: convert shmem_sg_free_table() to use a folio_batch scatterlist: add sg_set_folio() ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig16
-rw-r--r--mm/Makefile4
-rw-r--r--mm/backing-dev.c17
-rw-r--r--mm/cma.c4
-rw-r--r--mm/compaction.c334
-rw-r--r--mm/damon/core-test.h24
-rw-r--r--mm/damon/ops-common.c32
-rw-r--r--mm/damon/ops-common.h4
-rw-r--r--mm/damon/paddr.c6
-rw-r--r--mm/damon/vaddr.c26
-rw-r--r--mm/debug.c9
-rw-r--r--mm/debug_page_alloc.c59
-rw-r--r--mm/debug_vm_pgtable.c9
-rw-r--r--mm/dmapool.c10
-rw-r--r--mm/early_ioremap.c8
-rw-r--r--mm/fadvise.c17
-rw-r--r--mm/fail_page_alloc.c66
-rw-r--r--mm/filemap.c450
-rw-r--r--mm/frontswap.c10
-rw-r--r--mm/gup.c406
-rw-r--r--mm/gup_test.c27
-rw-r--r--mm/highmem.c12
-rw-r--r--mm/hmm.c6
-rw-r--r--mm/huge_memory.c56
-rw-r--r--mm/hugetlb.c126
-rw-r--r--mm/hugetlb_vmemmap.c17
-rw-r--r--mm/internal.h87
-rw-r--r--mm/kasan/common.c2
-rw-r--r--mm/kasan/generic.c76
-rw-r--r--mm/kasan/init.c9
-rw-r--r--mm/kasan/kasan.h159
-rw-r--r--mm/kasan/report.c44
-rw-r--r--mm/kasan/report_generic.c12
-rw-r--r--mm/kasan/report_hw_tags.c2
-rw-r--r--mm/kasan/report_sw_tags.c2
-rw-r--r--mm/kasan/shadow.c46
-rw-r--r--mm/kasan/sw_tags.c20
-rw-r--r--mm/kasan/tags.c2
-rw-r--r--mm/khugepaged.c125
-rw-r--r--mm/kmsan/core.c6
-rw-r--r--mm/kmsan/instrumentation.c2
-rw-r--r--mm/ksm.c38
-rw-r--r--mm/madvise.c150
-rw-r--r--mm/mapping_dirty_helpers.c38
-rw-r--r--mm/memblock.c33
-rw-r--r--mm/memcontrol.c253
-rw-r--r--mm/memory-failure.c45
-rw-r--r--mm/memory-tiers.c3
-rw-r--r--mm/memory.c341
-rw-r--r--mm/memory_hotplug.c42
-rw-r--r--mm/mempolicy.c28
-rw-r--r--mm/migrate.c382
-rw-r--r--mm/migrate_device.c46
-rw-r--r--mm/mincore.c11
-rw-r--r--mm/mlock.c10
-rw-r--r--mm/mm_init.c154
-rw-r--r--mm/mmap.c222
-rw-r--r--mm/mprotect.c87
-rw-r--r--mm/mremap.c32
-rw-r--r--mm/oom_kill.c8
-rw-r--r--mm/page-writeback.c6
-rw-r--r--mm/page_alloc.c873
-rw-r--r--mm/page_isolation.c33
-rw-r--r--mm/page_owner.c2
-rw-r--r--mm/page_table_check.c6
-rw-r--r--mm/page_vma_mapped.c114
-rw-r--r--mm/pagewalk.c33
-rw-r--r--mm/percpu-internal.h11
-rw-r--r--mm/pgtable-generic.c58
-rw-r--r--mm/process_vm_access.c2
-rw-r--r--mm/ptdump.c2
-rw-r--r--mm/readahead.c1
-rw-r--r--mm/rmap.c36
-rw-r--r--mm/secretmem.c4
-rw-r--r--mm/shmem.c7
-rw-r--r--mm/show_mem.c429
-rw-r--r--mm/slab.c6
-rw-r--r--mm/slab.h5
-rw-r--r--mm/slab_common.c41
-rw-r--r--mm/sparse-vmemmap.c8
-rw-r--r--mm/sparse.c10
-rw-r--r--mm/swap.c20
-rw-r--r--mm/swap_state.c87
-rw-r--r--mm/swapfile.c109
-rw-r--r--mm/truncate.c27
-rw-r--r--mm/userfaultfd.c12
-rw-r--r--mm/vmalloc.c130
-rw-r--r--mm/vmscan.c197
-rw-r--r--mm/vmstat.c15
-rw-r--r--mm/workingset.c158
-rw-r--r--mm/z3fold.c249
-rw-r--r--mm/zbud.c167
-rw-r--r--mm/zpool.c48
-rw-r--r--mm/zsmalloc.c408
-rw-r--r--mm/zswap.c239
95 files changed, 3779 insertions, 4046 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 7672a22647b4..12f32f8d26bf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -46,6 +46,22 @@ config ZSWAP_DEFAULT_ON
The selection made here can be overridden by using the kernel
command line 'zswap.enabled=' option.
+config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON
+ bool "Invalidate zswap entries when pages are loaded"
+ depends on ZSWAP
+ help
+ If selected, exclusive loads for zswap will be enabled at boot,
+ otherwise it will be disabled.
+
+ If exclusive loads are enabled, when a page is loaded from zswap,
+ the zswap entry is invalidated at once, as opposed to leaving it
+ in zswap until the swap entry is freed.
+
+ This avoids having two copies of the same page in memory
+ (compressed and uncompressed) after faulting in a page from zswap.
+ The cost is that if the page was never dirtied and needs to be
+ swapped out again, it will be re-compressed.
+
choice
prompt "Default compressor"
depends on ZSWAP
diff --git a/mm/Makefile b/mm/Makefile
index e29afc890cde..678530a07326 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,7 +51,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o \
+ compaction.o show_mem.o\
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
@@ -89,6 +89,7 @@ obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_KFENCE) += kfence/
obj-$(CONFIG_KMSAN) += kmsan/
obj-$(CONFIG_FAILSLAB) += failslab.o
+obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
@@ -123,6 +124,7 @@ obj-$(CONFIG_SECRETMEM) += secretmem.o
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
+obj-$(CONFIG_DEBUG_PAGEALLOC) += debug_page_alloc.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_DAMON) += damon/
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7da9727fcdf3..3ffc3cfa7a14 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -20,7 +20,6 @@
struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
-static struct class *bdi_class;
static const char *bdi_unknown_name = "(unknown)";
/*
@@ -345,13 +344,19 @@ static struct attribute *bdi_dev_attrs[] = {
};
ATTRIBUTE_GROUPS(bdi_dev);
+static const struct class bdi_class = {
+ .name = "bdi",
+ .dev_groups = bdi_dev_groups,
+};
+
static __init int bdi_class_init(void)
{
- bdi_class = class_create("bdi");
- if (IS_ERR(bdi_class))
- return PTR_ERR(bdi_class);
+ int ret;
+
+ ret = class_register(&bdi_class);
+ if (ret)
+ return ret;
- bdi_class->dev_groups = bdi_dev_groups;
bdi_debug_init();
return 0;
@@ -1001,7 +1006,7 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
return 0;
vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
- dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
+ dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
if (IS_ERR(dev))
return PTR_ERR(dev);
diff --git a/mm/cma.c b/mm/cma.c
index 6268d6620254..a4cfe995e11e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -483,8 +483,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
if (ret != -EBUSY)
break;
- pr_debug("%s(): memory range at %p is busy, retrying\n",
- __func__, pfn_to_page(pfn));
+ pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n",
+ __func__, pfn, pfn_to_page(pfn));
trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
count, align);
diff --git a/mm/compaction.c b/mm/compaction.c
index c8bcdea15f5f..dbc9f86b1934 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -229,6 +229,33 @@ static void reset_cached_positions(struct zone *zone)
pageblock_start_pfn(zone_end_pfn(zone) - 1);
}
+#ifdef CONFIG_SPARSEMEM
+/*
+ * If the PFN falls into an offline section, return the start PFN of the
+ * next online section. If the PFN falls into an online section or if
+ * there is no next online section, return 0.
+ */
+static unsigned long skip_offline_sections(unsigned long start_pfn)
+{
+ unsigned long start_nr = pfn_to_section_nr(start_pfn);
+
+ if (online_section_nr(start_nr))
+ return 0;
+
+ while (++start_nr <= __highest_present_section_nr) {
+ if (online_section_nr(start_nr))
+ return section_nr_to_pfn(start_nr);
+ }
+
+ return 0;
+}
+#else
+static unsigned long skip_offline_sections(unsigned long start_pfn)
+{
+ return 0;
+}
+#endif
+
/*
* Compound pages of >= pageblock_order should consistently be skipped until
* released. It is always pointless to compact pages of such order (if they are
@@ -392,18 +419,14 @@ void reset_isolation_suitable(pg_data_t *pgdat)
* Sets the pageblock skip bit if it was clear. Note that this is a hint as
* locks are not required for read/writers. Returns true if it was already set.
*/
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
- unsigned long pfn)
+static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
bool skip;
- /* Do no update if skip hint is being ignored */
+ /* Do not update if skip hint is being ignored */
if (cc->ignore_skip_hint)
return false;
- if (!pageblock_aligned(pfn))
- return false;
-
skip = get_pageblock_skip(page);
if (!skip && !cc->no_set_skip_hint)
set_pageblock_skip(page);
@@ -440,9 +463,6 @@ static void update_pageblock_skip(struct compact_control *cc,
if (cc->no_set_skip_hint)
return;
- if (!page)
- return;
-
set_pageblock_skip(page);
/* Update where async and sync compaction should restart */
@@ -470,8 +490,7 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
}
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
- unsigned long pfn)
+static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
return false;
}
@@ -745,8 +764,9 @@ isolate_freepages_range(struct compact_control *cc,
}
/* Similar to reclaim, but different enough that they don't share logic */
-static bool too_many_isolated(pg_data_t *pgdat)
+static bool too_many_isolated(struct compact_control *cc)
{
+ pg_data_t *pgdat = cc->zone->zone_pgdat;
bool too_many;
unsigned long active, inactive, isolated;
@@ -758,6 +778,17 @@ static bool too_many_isolated(pg_data_t *pgdat)
isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
node_page_state(pgdat, NR_ISOLATED_ANON);
+ /*
+ * Allow GFP_NOFS to isolate past the limit set for regular
+ * compaction runs. This prevents an ABBA deadlock when other
+ * compactors have already isolated to the limit, but are
+ * blocked on filesystem locks held by the GFP_NOFS thread.
+ */
+ if (cc->gfp_mask & __GFP_FS) {
+ inactive >>= 3;
+ active >>= 3;
+ }
+
too_many = isolated > (inactive + active) / 2;
if (!too_many)
wake_throttle_isolated(pgdat);
@@ -791,6 +822,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
struct lruvec *lruvec;
unsigned long flags = 0;
struct lruvec *locked = NULL;
+ struct folio *folio = NULL;
struct page *page = NULL, *valid_page = NULL;
struct address_space *mapping;
unsigned long start_pfn = low_pfn;
@@ -806,7 +838,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
- while (unlikely(too_many_isolated(pgdat))) {
+ while (unlikely(too_many_isolated(cc))) {
/* stop isolation if there are still pages not migrated */
if (cc->nr_migratepages)
return -EAGAIN;
@@ -887,7 +919,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!valid_page && pageblock_aligned(low_pfn)) {
if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
- page = NULL;
+ folio = NULL;
goto isolate_abort;
}
valid_page = page;
@@ -919,7 +951,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* Hugepage was successfully isolated and placed
* on the cc->migratepages list.
*/
- low_pfn += compound_nr(page) - 1;
+ folio = page_folio(page);
+ low_pfn += folio_nr_pages(folio) - 1;
goto isolate_success_no_list;
}
@@ -987,8 +1020,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
locked = NULL;
}
- if (isolate_movable_page(page, mode))
+ if (isolate_movable_page(page, mode)) {
+ folio = page_folio(page);
goto isolate_success;
+ }
}
goto isolate_fail;
@@ -999,7 +1034,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
- if (unlikely(!get_page_unless_zero(page)))
+ folio = folio_get_nontail_page(page);
+ if (unlikely(!folio))
goto isolate_fail;
/*
@@ -1007,8 +1043,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
- mapping = page_mapping(page);
- if (!mapping && (page_count(page) - 1) > total_mapcount(page))
+ mapping = folio_mapping(folio);
+ if (!mapping && (folio_ref_count(folio) - 1) > folio_mapcount(folio))
goto isolate_fail_put;
/*
@@ -1019,11 +1055,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
goto isolate_fail_put;
/* Only take pages on LRU: a check now makes later tests safe */
- if (!PageLRU(page))
+ if (!folio_test_lru(folio))
goto isolate_fail_put;
/* Compaction might skip unevictable pages but CMA takes them */
- if (!(mode & ISOLATE_UNEVICTABLE) && PageUnevictable(page))
+ if (!(mode & ISOLATE_UNEVICTABLE) && folio_test_unevictable(folio))
goto isolate_fail_put;
/*
@@ -1032,10 +1068,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* it will be able to migrate without blocking - clean pages
* for the most part. PageWriteback would require blocking.
*/
- if ((mode & ISOLATE_ASYNC_MIGRATE) && PageWriteback(page))
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio))
goto isolate_fail_put;
- if ((mode & ISOLATE_ASYNC_MIGRATE) && PageDirty(page)) {
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_dirty(folio)) {
bool migrate_dirty;
/*
@@ -1047,22 +1083,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* the page lock until after the page is removed
* from the page cache.
*/
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto isolate_fail_put;
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
migrate_dirty = !mapping ||
mapping->a_ops->migrate_folio;
- unlock_page(page);
+ folio_unlock(folio);
if (!migrate_dirty)
goto isolate_fail_put;
}
- /* Try isolate the page */
- if (!TestClearPageLRU(page))
+ /* Try isolate the folio */
+ if (!folio_test_clear_lru(folio))
goto isolate_fail_put;
- lruvec = folio_lruvec(page_folio(page));
+ lruvec = folio_lruvec(folio);
/* If we already hold the lock, we can skip some rechecking */
if (lruvec != locked) {
@@ -1072,44 +1108,49 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
locked = lruvec;
- lruvec_memcg_debug(lruvec, page_folio(page));
+ lruvec_memcg_debug(lruvec, folio);
- /* Try get exclusive access under lock */
- if (!skip_updated) {
+ /*
+ * Try get exclusive access under lock. If marked for
+ * skip, the scan is aborted unless the current context
+ * is a rescan to reach the end of the pageblock.
+ */
+ if (!skip_updated && valid_page) {
skip_updated = true;
- if (test_and_set_skip(cc, page, low_pfn))
+ if (test_and_set_skip(cc, valid_page) &&
+ !cc->finish_pageblock) {
goto isolate_abort;
+ }
}
/*
- * Page become compound since the non-locked check,
- * and it's on LRU. It can only be a THP so the order
- * is safe to read and it's 0 for tail pages.
+ * folio become large since the non-locked check,
+ * and it's on LRU.
*/
- if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
- low_pfn += compound_nr(page) - 1;
- nr_scanned += compound_nr(page) - 1;
- SetPageLRU(page);
+ if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) {
+ low_pfn += folio_nr_pages(folio) - 1;
+ nr_scanned += folio_nr_pages(folio) - 1;
+ folio_set_lru(folio);
goto isolate_fail_put;
}
}
- /* The whole page is taken off the LRU; skip the tail pages. */
- if (PageCompound(page))
- low_pfn += compound_nr(page) - 1;
+ /* The folio is taken off the LRU */
+ if (folio_test_large(folio))
+ low_pfn += folio_nr_pages(folio) - 1;
/* Successfully isolated */
- del_page_from_lru_list(page, lruvec);
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_lru(page),
- thp_nr_pages(page));
+ lruvec_del_folio(lruvec, folio);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
isolate_success:
- list_add(&page->lru, &cc->migratepages);
+ list_add(&folio->lru, &cc->migratepages);
isolate_success_no_list:
- cc->nr_migratepages += compound_nr(page);
- nr_isolated += compound_nr(page);
- nr_scanned += compound_nr(page) - 1;
+ cc->nr_migratepages += folio_nr_pages(folio);
+ nr_isolated += folio_nr_pages(folio);
+ nr_scanned += folio_nr_pages(folio) - 1;
/*
* Avoid isolating too much unless this block is being
@@ -1131,7 +1172,7 @@ isolate_fail_put:
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
}
- put_page(page);
+ folio_put(folio);
isolate_fail:
if (!skip_on_failure && ret != -ENOMEM)
@@ -1172,14 +1213,14 @@ isolate_fail:
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
- page = NULL;
+ folio = NULL;
isolate_abort:
if (locked)
unlock_page_lruvec_irqrestore(locked, flags);
- if (page) {
- SetPageLRU(page);
- put_page(page);
+ if (folio) {
+ folio_set_lru(folio);
+ folio_put(folio);
}
/*
@@ -1191,7 +1232,7 @@ isolate_abort:
* rescanned twice in a row.
*/
if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) {
- if (valid_page && !skip_updated)
+ if (!cc->no_set_skip_hint && valid_page && !skip_updated)
set_pageblock_skip(valid_page);
update_cached_migrate(cc, low_pfn);
}
@@ -1379,7 +1420,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn)
isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
- if (cc->nr_freepages < cc->nr_migratepages)
+ if (start_pfn == end_pfn)
set_pageblock_skip(page);
return;
@@ -1403,11 +1444,10 @@ static int next_search_order(struct compact_control *cc, int order)
return order;
}
-static unsigned long
-fast_isolate_freepages(struct compact_control *cc)
+static void fast_isolate_freepages(struct compact_control *cc)
{
unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
- unsigned int nr_scanned = 0;
+ unsigned int nr_scanned = 0, total_isolated = 0;
unsigned long low_pfn, min_pfn, highest = 0;
unsigned long nr_isolated = 0;
unsigned long distance;
@@ -1417,7 +1457,7 @@ fast_isolate_freepages(struct compact_control *cc)
/* Full compaction passes in a negative order */
if (cc->order <= 0)
- return cc->free_pfn;
+ return;
/*
* If starting the scan, use a deeper search and use the highest
@@ -1506,6 +1546,7 @@ fast_isolate_freepages(struct compact_control *cc)
set_page_private(page, order);
nr_isolated = 1 << order;
nr_scanned += nr_isolated - 1;
+ total_isolated += nr_isolated;
cc->nr_freepages += nr_isolated;
list_add_tail(&page->lru, &cc->freepages);
count_compact_events(COMPACTISOLATED, nr_isolated);
@@ -1518,6 +1559,10 @@ fast_isolate_freepages(struct compact_control *cc)
spin_unlock_irqrestore(&cc->zone->lock, flags);
+ /* Skip fast search if enough freepages isolated */
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ break;
+
/*
* Smaller scan on next order so the total scan is related
* to freelist_scan_limit.
@@ -1526,6 +1571,9 @@ fast_isolate_freepages(struct compact_control *cc)
limit = max(1U, limit >> 1);
}
+ trace_mm_compaction_fast_isolate_freepages(min_pfn, cc->free_pfn,
+ nr_scanned, total_isolated);
+
if (!page) {
cc->fast_search_fail++;
if (scan_start) {
@@ -1556,11 +1604,10 @@ fast_isolate_freepages(struct compact_control *cc)
cc->total_free_scanned += nr_scanned;
if (!page)
- return cc->free_pfn;
+ return;
low_pfn = page_to_pfn(page);
fast_isolate_around(cc, low_pfn);
- return low_pfn;
}
/*
@@ -1684,11 +1731,10 @@ splitmap:
* This is a migrate-callback that "allocates" freepages by taking pages
* from the isolated freelists in the block we are migrating to.
*/
-static struct page *compaction_alloc(struct page *migratepage,
- unsigned long data)
+static struct folio *compaction_alloc(struct folio *src, unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
- struct page *freepage;
+ struct folio *dst;
if (list_empty(&cc->freepages)) {
isolate_freepages(cc);
@@ -1697,11 +1743,11 @@ static struct page *compaction_alloc(struct page *migratepage,
return NULL;
}
- freepage = list_entry(cc->freepages.next, struct page, lru);
- list_del(&freepage->lru);
+ dst = list_entry(cc->freepages.next, struct folio, lru);
+ list_del(&dst->lru);
cc->nr_freepages--;
- return freepage;
+ return dst;
}
/*
@@ -1709,11 +1755,11 @@ static struct page *compaction_alloc(struct page *migratepage,
* freelist. All pages on the freelist are from the same zone, so there is no
* special handling needed for NUMA.
*/
-static void compaction_free(struct page *page, unsigned long data)
+static void compaction_free(struct folio *dst, unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
- list_add(&page->lru, &cc->freepages);
+ list_add(&dst->lru, &cc->freepages);
cc->nr_freepages++;
}
@@ -1736,6 +1782,7 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE
*/
static unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
static int sysctl_extfrag_threshold = 500;
+static int __read_mostly sysctl_compact_memory;
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -1864,7 +1911,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
- set_pageblock_skip(freepage);
break;
}
}
@@ -1940,8 +1986,14 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
page = pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, cc->zone);
- if (!page)
+ if (!page) {
+ unsigned long next_pfn;
+
+ next_pfn = skip_offline_sections(block_start_pfn);
+ if (next_pfn)
+ block_end_pfn = min(next_pfn, cc->free_pfn);
continue;
+ }
/*
* If isolation recently failed, do not retry. Only check the
@@ -2193,25 +2245,11 @@ static enum compact_result compact_finished(struct compact_control *cc)
return ret;
}
-static enum compact_result __compaction_suitable(struct zone *zone, int order,
- unsigned int alloc_flags,
- int highest_zoneidx,
- unsigned long wmark_target)
+static bool __compaction_suitable(struct zone *zone, int order,
+ int highest_zoneidx,
+ unsigned long wmark_target)
{
unsigned long watermark;
-
- if (is_via_compact_memory(order))
- return COMPACT_CONTINUE;
-
- watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
- /*
- * If watermarks for high-order allocation are already met, there
- * should be no need for compaction at all.
- */
- if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
- alloc_flags))
- return COMPACT_SUCCESS;
-
/*
* Watermarks for order-0 must be met for compaction to be able to
* isolate free pages for migration targets. This means that the
@@ -2229,29 +2267,20 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
- if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target))
- return COMPACT_SKIPPED;
-
- return COMPACT_CONTINUE;
+ return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+ ALLOC_CMA, wmark_target);
}
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
- * Returns
- * COMPACT_SKIPPED - If there are too few free pages for compaction
- * COMPACT_SUCCESS - If the allocation would succeed without compaction
- * COMPACT_CONTINUE - If compaction should run now
*/
-enum compact_result compaction_suitable(struct zone *zone, int order,
- unsigned int alloc_flags,
- int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
{
- enum compact_result ret;
- int fragindex;
+ enum compact_result compact_result;
+ bool suitable;
- ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
- zone_page_state(zone, NR_FREE_PAGES));
+ suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
* low memory or external fragmentation
@@ -2268,17 +2297,24 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
* excessive compaction for costly orders, but it should not be at the
* expense of system stability.
*/
- if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
- fragindex = fragmentation_index(zone, order);
- if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
- ret = COMPACT_NOT_SUITABLE_ZONE;
+ if (suitable) {
+ compact_result = COMPACT_CONTINUE;
+ if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ int fragindex = fragmentation_index(zone, order);
+
+ if (fragindex >= 0 &&
+ fragindex <= sysctl_extfrag_threshold) {
+ suitable = false;
+ compact_result = COMPACT_NOT_SUITABLE_ZONE;
+ }
+ }
+ } else {
+ compact_result = COMPACT_SKIPPED;
}
- trace_mm_compaction_suitable(zone, order, ret);
- if (ret == COMPACT_NOT_SUITABLE_ZONE)
- ret = COMPACT_SKIPPED;
+ trace_mm_compaction_suitable(zone, order, compact_result);
- return ret;
+ return suitable;
}
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
@@ -2294,7 +2330,6 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
- enum compact_result compact_result;
/*
* Do not consider all the reclaimable memory because we do not
@@ -2304,9 +2339,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
*/
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
- compact_result = __compaction_suitable(zone, order, alloc_flags,
- ac->highest_zoneidx, available);
- if (compact_result == COMPACT_CONTINUE)
+ if (__compaction_suitable(zone, order, ac->highest_zoneidx,
+ available))
return true;
}
@@ -2336,11 +2370,22 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
INIT_LIST_HEAD(&cc->migratepages);
cc->migratetype = gfp_migratetype(cc->gfp_mask);
- ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
- cc->highest_zoneidx);
- /* Compaction is likely to fail */
- if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
- return ret;
+
+ if (!is_via_compact_memory(cc->order)) {
+ unsigned long watermark;
+
+ /* Allocation can already succeed, nothing to do */
+ watermark = wmark_pages(cc->zone,
+ cc->alloc_flags & ALLOC_WMARK_MASK);
+ if (zone_watermark_ok(cc->zone, cc->order, watermark,
+ cc->highest_zoneidx, cc->alloc_flags))
+ return COMPACT_SUCCESS;
+
+ /* Compaction is likely to fail */
+ if (!compaction_suitable(cc->zone, cc->order,
+ cc->highest_zoneidx))
+ return COMPACT_SKIPPED;
+ }
/*
* Clear pageblock skip if there were failures recently and compaction
@@ -2456,7 +2501,8 @@ rescan:
}
/*
* If an ASYNC or SYNC_LIGHT fails to migrate a page
- * within the current order-aligned block, scan the
+ * within the current order-aligned block and
+ * fast_find_migrateblock may be used then scan the
* remainder of the pageblock. This will mark the
* pageblock "skip" to avoid rescanning in the near
* future. This will isolate more pages than necessary
@@ -2464,8 +2510,9 @@ rescan:
* fast_find_migrateblock revisiting blocks that were
* recently partially scanned.
*/
- if (cc->direct_compaction && !cc->finish_pageblock &&
- (cc->mode < MIGRATE_SYNC)) {
+ if (!pageblock_aligned(cc->migrate_pfn) &&
+ !cc->ignore_skip_hint && !cc->finish_pageblock &&
+ (cc->mode < MIGRATE_SYNC)) {
cc->finish_pageblock = true;
/*
@@ -2780,6 +2827,15 @@ static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int
static int sysctl_compaction_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, length, ppos);
+ if (ret)
+ return ret;
+
+ if (sysctl_compact_memory != 1)
+ return -EINVAL;
+
if (write)
compact_nodes();
@@ -2833,8 +2889,14 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
if (!populated_zone(zone))
continue;
- if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
- highest_zoneidx) == COMPACT_CONTINUE)
+ /* Allocation can already succeed, check other zones */
+ if (zone_watermark_ok(zone, pgdat->kcompactd_max_order,
+ min_wmark_pages(zone),
+ highest_zoneidx, 0))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order,
+ highest_zoneidx))
return true;
}
@@ -2871,8 +2933,12 @@ static void kcompactd_do_work(pg_data_t *pgdat)
if (compaction_deferred(zone, cc.order))
continue;
- if (compaction_suitable(zone, cc.order, 0, zoneid) !=
- COMPACT_CONTINUE)
+ /* Allocation can already succeed, nothing to do */
+ if (zone_watermark_ok(zone, cc.order,
+ min_wmark_pages(zone), zoneid, 0))
+ continue;
+
+ if (!compaction_suitable(zone, cc.order, zoneid))
continue;
if (kthread_should_stop())
@@ -3021,7 +3087,7 @@ static int kcompactd(void *p)
* This kcompactd start function will be called by init and node-hot-add.
* On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
*/
-void kcompactd_run(int nid)
+void __meminit kcompactd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
@@ -3039,7 +3105,7 @@ void kcompactd_run(int nid)
* Called by memory hotplug when all memory in a node is offlined. Caller must
* be holding mem_hotplug_begin/done().
*/
-void kcompactd_stop(int nid)
+void __meminit kcompactd_stop(int nid)
{
struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
@@ -3095,7 +3161,7 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
static struct ctl_table vm_compaction[] = {
{
.procname = "compact_memory",
- .data = NULL,
+ .data = &sysctl_compact_memory,
.maxlen = sizeof(int),
.mode = 0200,
.proc_handler = sysctl_compaction_handler,
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index fae64d32b925..c11210124344 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -318,6 +318,29 @@ static void damon_test_update_monitoring_result(struct kunit *test)
KUNIT_EXPECT_EQ(test, r->age, 20);
}
+static void damon_test_set_attrs(struct kunit *test)
+{
+ struct damon_ctx ctx;
+ struct damon_attrs valid_attrs = {
+ .min_nr_regions = 10, .max_nr_regions = 1000,
+ .sample_interval = 5000, .aggr_interval = 100000,};
+ struct damon_attrs invalid_attrs;
+
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &valid_attrs), 0);
+
+ invalid_attrs = valid_attrs;
+ invalid_attrs.min_nr_regions = 1;
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+
+ invalid_attrs = valid_attrs;
+ invalid_attrs.max_nr_regions = 9;
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+
+ invalid_attrs = valid_attrs;
+ invalid_attrs.aggr_interval = 4999;
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -329,6 +352,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_ops_registration),
KUNIT_CASE(damon_test_set_regions),
KUNIT_CASE(damon_test_update_monitoring_result),
+ KUNIT_CASE(damon_test_set_attrs),
{},
};
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index cc63cf953636..e940802a15a4 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -37,51 +37,29 @@ struct folio *damon_get_folio(unsigned long pfn)
return folio;
}
-void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
+void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr)
{
- bool referenced = false;
- struct folio *folio = damon_get_folio(pte_pfn(*pte));
+ struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte)));
if (!folio)
return;
- if (pte_young(*pte)) {
- referenced = true;
- *pte = pte_mkold(*pte);
- }
-
-#ifdef CONFIG_MMU_NOTIFIER
- if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE))
- referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
-
- if (referenced)
+ if (ptep_clear_young_notify(vma, addr, pte))
folio_set_young(folio);
folio_set_idle(folio);
folio_put(folio);
}
-void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
+void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- bool referenced = false;
struct folio *folio = damon_get_folio(pmd_pfn(*pmd));
if (!folio)
return;
- if (pmd_young(*pmd)) {
- referenced = true;
- *pmd = pmd_mkold(*pmd);
- }
-
-#ifdef CONFIG_MMU_NOTIFIER
- if (mmu_notifier_clear_young(mm, addr, addr + HPAGE_PMD_SIZE))
- referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
-
- if (referenced)
+ if (pmdp_clear_young_notify(vma, addr, pmd))
folio_set_young(folio);
folio_set_idle(folio);
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 14f4bc69f29b..18d837d11bce 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -9,8 +9,8 @@
struct folio *damon_get_folio(unsigned long pfn);
-void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
-void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
+void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr);
+void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr);
int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 467b99166b43..40801e38fcf0 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -24,9 +24,9 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte)
- damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr);
+ damon_ptep_mkold(pvmw.pte, vma, addr);
else
- damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr);
+ damon_pmdp_mkold(pvmw.pmd, vma, addr);
}
return true;
}
@@ -89,7 +89,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte) {
- *accessed = pte_young(*pvmw.pte) ||
+ *accessed = pte_young(ptep_get(pvmw.pte)) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 1fec16d7263e..2fcc9731528a 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -311,19 +311,21 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
}
if (pmd_trans_huge(*pmd)) {
- damon_pmdp_mkold(pmd, walk->mm, addr);
+ damon_pmdp_mkold(pmd, walk->vma, addr);
spin_unlock(ptl);
return 0;
}
spin_unlock(ptl);
}
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- return 0;
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte_present(*pte))
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+ if (!pte_present(ptep_get(pte)))
goto out;
- damon_ptep_mkold(pte, walk->mm, addr);
+ damon_ptep_mkold(pte, walk->vma, addr);
out:
pte_unmap_unlock(pte, ptl);
return 0;
@@ -431,6 +433,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
pte_t *pte;
+ pte_t ptent;
spinlock_t *ptl;
struct folio *folio;
struct damon_young_walk_private *priv = walk->private;
@@ -464,15 +467,18 @@ huge_out:
regular_page:
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- return -EINVAL;
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte_present(*pte))
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+ ptent = ptep_get(pte);
+ if (!pte_present(ptent))
goto out;
- folio = damon_get_folio(pte_pfn(*pte));
+ folio = damon_get_folio(pte_pfn(ptent));
if (!folio)
goto out;
- if (pte_young(*pte) || !folio_test_idle(folio) ||
+ if (pte_young(ptent) || !folio_test_idle(folio) ||
mmu_notifier_test_young(walk->mm, addr))
priv->young = true;
*priv->folio_sz = folio_size(folio);
diff --git a/mm/debug.c b/mm/debug.c
index c7b228097bd9..ee533a5ceb79 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -268,4 +268,13 @@ void page_init_poison(struct page *page, size_t size)
if (page_init_poisoning)
memset(page, PAGE_POISON_PATTERN, size);
}
+
+void vma_iter_dump_tree(const struct vma_iterator *vmi)
+{
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ mas_dump(&vmi->mas);
+ mt_dump(vmi->mas.tree, mt_dump_hex);
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
+}
+
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
new file mode 100644
index 000000000000..f9d145730fd1
--- /dev/null
+++ b/mm/debug_page_alloc.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/page-isolation.h>
+
+unsigned int _debug_guardpage_minorder;
+
+bool _debug_pagealloc_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
+
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
+
+static int __init early_debug_pagealloc(char *buf)
+{
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
+}
+early_param("debug_pagealloc", early_debug_pagealloc);
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+ unsigned long res;
+
+ if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+ pr_err("Bad debug_guardpage_minorder value\n");
+ return 0;
+ }
+ _debug_guardpage_minorder = res;
+ pr_info("Setting debug_guardpage_minorder to %lu\n", res);
+ return 0;
+}
+early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
+
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
+ int migratetype)
+{
+ if (order >= debug_guardpage_minorder())
+ return false;
+
+ __SetPageGuard(page);
+ INIT_LIST_HEAD(&page->buddy_list);
+ set_page_private(page, order);
+ /* Guard pages are not available for any usage */
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -(1 << order), migratetype);
+
+ return true;
+}
+
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
+ int migratetype)
+{
+ __ClearPageGuard(page);
+
+ set_page_private(page, 0);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, (1 << order), migratetype);
+}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c54177aabebd..ee119e33fef1 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -138,6 +138,9 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args)
return;
pr_debug("Validating PTE advanced\n");
+ if (WARN_ON(!args->ptep))
+ return;
+
pte = pfn_pte(args->pte_pfn, args->page_prot);
set_pte_at(args->mm, args->vaddr, args->ptep, pte);
flush_dcache_page(page);
@@ -619,6 +622,9 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args)
* the unexpected overhead of cache flushing is acceptable.
*/
pr_debug("Validating PTE clear\n");
+ if (WARN_ON(!args->ptep))
+ return;
+
#ifndef CONFIG_RISCV
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
#endif
@@ -1377,7 +1383,8 @@ static int __init debug_vm_pgtable(void)
args.ptep = pte_offset_map_lock(args.mm, args.pmdp, args.vaddr, &ptl);
pte_clear_tests(&args);
pte_advanced_tests(&args);
- pte_unmap_unlock(args.ptep, ptl);
+ if (args.ptep)
+ pte_unmap_unlock(args.ptep, ptl);
ptl = pmd_lock(args.mm, args.pmdp);
pmd_clear_tests(&args);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index d2b0f8fc9649..a151a21e571b 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -226,7 +226,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
{
struct dma_pool *retval;
size_t allocation;
- bool empty = false;
+ bool empty;
if (!dev)
return NULL;
@@ -276,8 +276,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
*/
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
- if (list_empty(&dev->dma_pools))
- empty = true;
+ empty = list_empty(&dev->dma_pools);
list_add(&retval->pools, &dev->dma_pools);
mutex_unlock(&pools_lock);
if (empty) {
@@ -361,7 +360,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
void dma_pool_destroy(struct dma_pool *pool)
{
struct dma_page *page, *tmp;
- bool empty = false, busy = false;
+ bool empty, busy = false;
if (unlikely(!pool))
return;
@@ -369,8 +368,7 @@ void dma_pool_destroy(struct dma_pool *pool)
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
list_del(&pool->pools);
- if (list_empty(&pool->dev->dma_pools))
- empty = true;
+ empty = list_empty(&pool->dev->dma_pools);
mutex_unlock(&pools_lock);
if (empty)
device_remove_file(pool->dev, &dev_attr_pools);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 9bc12e526ed0..ce06b2884789 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -72,12 +72,10 @@ void __init early_ioremap_setup(void)
{
int i;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- if (WARN_ON(prev_map[i]))
- break;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ WARN_ON_ONCE(prev_map[i]);
slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+ }
}
static int __init check_early_ioremap_leak(void)
diff --git a/mm/fadvise.c b/mm/fadvise.c
index fb7c5f43fd2a..6c39d42f16dc 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -14,7 +14,6 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
-#include <linux/pagevec.h>
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
@@ -143,7 +142,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
}
if (end_index >= start_index) {
- unsigned long nr_pagevec = 0;
+ unsigned long nr_failed = 0;
/*
* It's common to FADV_DONTNEED right after
@@ -156,17 +155,15 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
*/
lru_add_drain();
- invalidate_mapping_pagevec(mapping,
- start_index, end_index,
- &nr_pagevec);
+ mapping_try_invalidate(mapping, start_index, end_index,
+ &nr_failed);
/*
- * If fewer pages were invalidated than expected then
- * it is possible that some of the pages were on
- * a per-cpu pagevec for a remote CPU. Drain all
- * pagevecs and try again.
+ * The failures may be due to the folio being
+ * in the LRU cache of a remote CPU. Drain all
+ * caches and try again.
*/
- if (nr_pagevec) {
+ if (nr_failed) {
lru_add_drain_all();
invalidate_mapping_pages(mapping, start_index,
end_index);
diff --git a/mm/fail_page_alloc.c b/mm/fail_page_alloc.c
new file mode 100644
index 000000000000..b1b09cce9394
--- /dev/null
+++ b/mm/fail_page_alloc.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fault-inject.h>
+#include <linux/mm.h>
+
+static struct {
+ struct fault_attr attr;
+
+ bool ignore_gfp_highmem;
+ bool ignore_gfp_reclaim;
+ u32 min_order;
+} fail_page_alloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_reclaim = true,
+ .ignore_gfp_highmem = true,
+ .min_order = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+ return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ int flags = 0;
+
+ if (order < fail_page_alloc.min_order)
+ return false;
+ if (gfp_mask & __GFP_NOFAIL)
+ return false;
+ if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+ return false;
+ if (fail_page_alloc.ignore_gfp_reclaim &&
+ (gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* See comment in __should_failslab() */
+ if (gfp_mask & __GFP_NOWARN)
+ flags |= FAULT_NOWARN;
+
+ return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+ umode_t mode = S_IFREG | 0600;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+ &fail_page_alloc.attr);
+
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
+
+ return 0;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/mm/filemap.c b/mm/filemap.c
index 00f01d8ead47..9e44a49bbd74 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
@@ -58,6 +59,8 @@
#include <asm/mman.h>
+#include "swap.h"
+
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -114,7 +117,7 @@
* ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
+ * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->block_dirty_folio)
@@ -1359,8 +1362,6 @@ repeat:
/**
* migration_entry_wait_on_locked - Wait for a migration entry to be removed
* @entry: migration swap entry.
- * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required
- * for pte entries, pass NULL for pmd entries.
* @ptl: already locked ptl. This function will drop the lock.
*
* Wait for a migration entry referencing the given page to be removed. This is
@@ -1369,13 +1370,13 @@ repeat:
* should be called while holding the ptl for the migration entry referencing
* the page.
*
- * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock().
+ * Returns after unlocking the ptl.
*
* This follows the same logic as folio_wait_bit_common() so see the comments
* there.
*/
-void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
- spinlock_t *ptl)
+void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
+ __releases(ptl)
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
@@ -1409,10 +1410,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
* a valid reference to the page, and it must take the ptl to remove the
* migration entry. So the page is valid until the ptl is dropped.
*/
- if (ptep)
- pte_unmap_unlock(ptep, ptl);
- else
- spin_unlock(ptl);
+ spin_unlock(ptl);
for (;;) {
unsigned int flags;
@@ -1625,36 +1623,6 @@ void folio_end_writeback(struct folio *folio)
}
EXPORT_SYMBOL(folio_end_writeback);
-/*
- * After completing I/O on a page, call this routine to update the page
- * flags appropriately
- */
-void page_endio(struct page *page, bool is_write, int err)
-{
- struct folio *folio = page_folio(page);
-
- if (!is_write) {
- if (!err) {
- folio_mark_uptodate(folio);
- } else {
- folio_clear_uptodate(folio);
- folio_set_error(folio);
- }
- folio_unlock(folio);
- } else {
- if (err) {
- struct address_space *mapping;
-
- folio_set_error(folio);
- mapping = folio_mapping(folio);
- if (mapping)
- mapping_set_error(mapping, err);
- }
- folio_end_writeback(folio);
- }
-}
-EXPORT_SYMBOL_GPL(page_endio);
-
/**
* __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
* @folio: The folio to lock
@@ -1760,9 +1728,7 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'return - index >= max_scan' will be true).
- * In the rare case of index wrap-around, 0 will be returned. 0 will also
- * be returned if index == 0 and there is a gap at the index. We can not
- * wrap-around if passed index == 0.
+ * In the rare case of index wrap-around, 0 will be returned.
*/
pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
@@ -1772,13 +1738,12 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
while (max_scan--) {
void *entry = xas_next(&xas);
if (!entry || xa_is_value(entry))
- return xas.xa_index;
- if (xas.xa_index == 0 && index != 0)
- return xas.xa_index;
+ break;
+ if (xas.xa_index == 0)
+ break;
}
- /* No gaps in range and no wrap-around, return index beyond range */
- return xas.xa_index + 1;
+ return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_next_miss);
@@ -1799,9 +1764,7 @@ EXPORT_SYMBOL(page_cache_next_miss);
*
* Return: The index of the gap if found, otherwise an index outside the
* range specified (in which case 'index - return >= max_scan' will be true).
- * In the rare case of wrap-around, ULONG_MAX will be returned. ULONG_MAX
- * will also be returned if index == ULONG_MAX and there is a gap at the
- * index. We can not wrap-around if passed index == ULONG_MAX.
+ * In the rare case of wrap-around, ULONG_MAX will be returned.
*/
pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
@@ -1811,13 +1774,12 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
while (max_scan--) {
void *entry = xas_prev(&xas);
if (!entry || xa_is_value(entry))
- return xas.xa_index;
- if (xas.xa_index == ULONG_MAX && index != ULONG_MAX)
- return xas.xa_index;
+ break;
+ if (xas.xa_index == ULONG_MAX)
+ break;
}
- /* No gaps in range and no wrap-around, return index beyond range */
- return xas.xa_index - 1;
+ return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);
@@ -2767,6 +2729,48 @@ put_folios:
}
EXPORT_SYMBOL_GPL(filemap_read);
+int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ loff_t pos = iocb->ki_pos;
+ loff_t end = pos + count - 1;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_needs_writeback(mapping, pos, end))
+ return -EAGAIN;
+ return 0;
+ }
+
+ return filemap_write_and_wait_range(mapping, pos, end);
+}
+
+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ loff_t pos = iocb->ki_pos;
+ loff_t end = pos + count - 1;
+ int ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* we could block if there are any pages in the range */
+ if (filemap_range_has_page(mapping, pos, end))
+ return -EAGAIN;
+ } else {
+ ret = filemap_write_and_wait_range(mapping, pos, end);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+}
+
/**
* generic_file_read_iter - generic filesystem read routine
* @iocb: kernel I/O control block
@@ -2802,18 +2806,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1))
- return -EAGAIN;
- } else {
- retval = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (retval < 0)
- return retval;
- }
-
+ retval = kiocb_write_and_wait(iocb, count);
+ if (retval < 0)
+ return retval;
file_accessed(file);
retval = mapping->a_ops->direct_IO(iocb, iter);
@@ -3436,13 +3431,6 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
if (pmd_none(*vmf->pmd))
pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
- /* See comment in handle_pte_fault() */
- if (pmd_devmap_trans_unstable(vmf->pmd)) {
- folio_unlock(folio);
- folio_put(folio);
- return true;
- }
-
return false;
}
@@ -3529,6 +3517,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
+ if (!vmf->pte) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+ }
do {
again:
page = folio_file_page(folio, xas.xa_index);
@@ -3547,7 +3540,7 @@ again:
* handled in the specific fault path, and it'll prohibit the
* fault-around logic.
*/
- if (!pte_none(*vmf->pte))
+ if (!pte_none(ptep_get(vmf->pte)))
goto unlock;
/* We're about to handle the fault */
@@ -3806,7 +3799,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
/*
* Warn about a page cache invalidation failure during a direct I/O write.
*/
-void dio_warn_stale_pagecache(struct file *filp)
+static void dio_warn_stale_pagecache(struct file *filp)
{
static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
char pathname[128];
@@ -3823,48 +3816,33 @@ void dio_warn_stale_pagecache(struct file *filp)
}
}
-ssize_t
-generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- loff_t pos = iocb->ki_pos;
- ssize_t written;
- size_t write_len;
- pgoff_t end;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
- write_len = iov_iter_count(from);
- end = (pos + write_len - 1) >> PAGE_SHIFT;
+ if (mapping->nrpages &&
+ invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT,
+ (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
+ dio_warn_stale_pagecache(iocb->ki_filp);
+}
- if (iocb->ki_flags & IOCB_NOWAIT) {
- /* If there are pages to writeback, return */
- if (filemap_range_has_page(file->f_mapping, pos,
- pos + write_len - 1))
- return -EAGAIN;
- } else {
- written = filemap_write_and_wait_range(mapping, pos,
- pos + write_len - 1);
- if (written)
- goto out;
- }
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ size_t write_len = iov_iter_count(from);
+ ssize_t written;
/*
- * After a write we want buffered reads to be sure to go to disk to get
- * the new data. We invalidate clean cached page from the region we're
- * about to write. We do this *before* the write so that we can return
- * without clobbering -EIOCBQUEUED from ->direct_IO().
- */
- written = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
- /*
* If a page can not be invalidated, return 0 to fall back
* to buffered write.
*/
+ written = kiocb_invalidate_pages(iocb, write_len);
if (written) {
if (written == -EBUSY)
return 0;
- goto out;
+ return written;
}
written = mapping->a_ops->direct_IO(iocb, from);
@@ -3886,11 +3864,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
*
* Skip invalidation for async writes or if mapping has no pages.
*/
- if (written > 0 && mapping->nrpages &&
- invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
- dio_warn_stale_pagecache(file);
-
if (written > 0) {
+ struct inode *inode = mapping->host;
+ loff_t pos = iocb->ki_pos;
+
+ kiocb_invalidate_post_direct_write(iocb, written);
pos += written;
write_len -= written;
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -3901,7 +3879,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
}
if (written != -EIOCBQUEUED)
iov_iter_revert(from, write_len - iov_iter_count(from));
-out:
return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
@@ -3980,7 +3957,10 @@ again:
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
- return written ? written : status;
+ if (!written)
+ return status;
+ iocb->ki_pos += written;
+ return written;
}
EXPORT_SYMBOL(generic_perform_write);
@@ -4009,25 +3989,19 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t written = 0;
- ssize_t err;
- ssize_t status;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_privs(file);
- if (err)
- goto out;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
- err = file_update_time(file);
- if (err)
- goto out;
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
- if (iocb->ki_flags & IOCB_DIRECT) {
- loff_t pos, endbyte;
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
- written = generic_file_direct_write(iocb, from);
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = generic_file_direct_write(iocb, from);
/*
* If the write stopped short of completing, fall back to
* buffered writes. Some filesystems do this for writes to
@@ -4035,49 +4009,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* not succeed (even if it did, DAX does not handle dirty
* page-cache pages correctly).
*/
- if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
- goto out;
-
- pos = iocb->ki_pos;
- status = generic_perform_write(iocb, from);
- /*
- * If generic_perform_write() returned a synchronous error
- * then we want to return the number of bytes which were
- * direct-written, or the error code if that was zero. Note
- * that this differs from normal direct-io semantics, which
- * will return -EFOO even if some bytes were written.
- */
- if (unlikely(status < 0)) {
- err = status;
- goto out;
- }
- /*
- * We need to ensure that the page cache pages are written to
- * disk and invalidated to preserve the expected O_DIRECT
- * semantics.
- */
- endbyte = pos + status - 1;
- err = filemap_write_and_wait_range(mapping, pos, endbyte);
- if (err == 0) {
- iocb->ki_pos = endbyte + 1;
- written += status;
- invalidate_mapping_pages(mapping,
- pos >> PAGE_SHIFT,
- endbyte >> PAGE_SHIFT);
- } else {
- /*
- * We don't know how much we wrote, so just return
- * the number of bytes which were direct-written
- */
- }
- } else {
- written = generic_perform_write(iocb, from);
- if (likely(written > 0))
- iocb->ki_pos += written;
+ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
+ return ret;
+ return direct_write_fallback(iocb, from, ret,
+ generic_perform_write(iocb, from));
}
-out:
- current->backing_dev_info = NULL;
- return written ? written : err;
+
+ return generic_perform_write(iocb, from);
}
EXPORT_SYMBOL(__generic_file_write_iter);
@@ -4142,3 +4080,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);
+
+#ifdef CONFIG_CACHESTAT_SYSCALL
+/**
+ * filemap_cachestat() - compute the page cache statistics of a mapping
+ * @mapping: The mapping to compute the statistics for.
+ * @first_index: The starting page cache index.
+ * @last_index: The final page index (inclusive).
+ * @cs: the cachestat struct to write the result to.
+ *
+ * This will query the page cache statistics of a mapping in the
+ * page range of [first_index, last_index] (inclusive). The statistics
+ * queried include: number of dirty pages, number of pages marked for
+ * writeback, and the number of (recently) evicted pages.
+ */
+static void filemap_cachestat(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
+{
+ XA_STATE(xas, &mapping->i_pages, first_index);
+ struct folio *folio;
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_index) {
+ unsigned long nr_pages;
+ pgoff_t folio_first_index, folio_last_index;
+
+ if (xas_retry(&xas, folio))
+ continue;
+
+ if (xa_is_value(folio)) {
+ /* page is evicted */
+ void *shadow = (void *)folio;
+ bool workingset; /* not used */
+ int order = xa_get_order(xas.xa, xas.xa_index);
+
+ nr_pages = 1 << order;
+ folio_first_index = round_down(xas.xa_index, 1 << order);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
+ cs->nr_evicted += nr_pages;
+
+#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
+ if (shmem_mapping(mapping)) {
+ /* shmem file - in swap cache */
+ swp_entry_t swp = radix_to_swp_entry(folio);
+
+ shadow = get_shadow_from_swap_cache(swp);
+ }
+#endif
+ if (workingset_test_recent(shadow, true, &workingset))
+ cs->nr_recently_evicted += nr_pages;
+
+ goto resched;
+ }
+
+ nr_pages = folio_nr_pages(folio);
+ folio_first_index = folio_pgoff(folio);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
+ /* page is in cache */
+ cs->nr_cache += nr_pages;
+
+ if (folio_test_dirty(folio))
+ cs->nr_dirty += nr_pages;
+
+ if (folio_test_writeback(folio))
+ cs->nr_writeback += nr_pages;
+
+resched:
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * The cachestat(2) system call.
+ *
+ * cachestat() returns the page cache statistics of a file in the
+ * bytes range specified by `off` and `len`: number of cached pages,
+ * number of dirty pages, number of pages marked for writeback,
+ * number of evicted pages, and number of recently evicted pages.
+ *
+ * An evicted page is a page that is previously in the page cache
+ * but has been evicted since. A page is recently evicted if its last
+ * eviction was recent enough that its reentry to the cache would
+ * indicate that it is actively being used by the system, and that
+ * there is memory pressure on the system.
+ *
+ * `off` and `len` must be non-negative integers. If `len` > 0,
+ * the queried range is [`off`, `off` + `len`]. If `len` == 0,
+ * we will query in the range from `off` to the end of the file.
+ *
+ * The `flags` argument is unused for now, but is included for future
+ * extensibility. User should pass 0 (i.e no flag specified).
+ *
+ * Currently, hugetlbfs is not supported.
+ *
+ * Because the status of a page can change after cachestat() checks it
+ * but before it returns to the application, the returned values may
+ * contain stale information.
+ *
+ * return values:
+ * zero - success
+ * -EFAULT - cstat or cstat_range points to an illegal address
+ * -EINVAL - invalid flags
+ * -EBADF - invalid file descriptor
+ * -EOPNOTSUPP - file descriptor is of a hugetlbfs file
+ */
+SYSCALL_DEFINE4(cachestat, unsigned int, fd,
+ struct cachestat_range __user *, cstat_range,
+ struct cachestat __user *, cstat, unsigned int, flags)
+{
+ struct fd f = fdget(fd);
+ struct address_space *mapping;
+ struct cachestat_range csr;
+ struct cachestat cs;
+ pgoff_t first_index, last_index;
+
+ if (!f.file)
+ return -EBADF;
+
+ if (copy_from_user(&csr, cstat_range,
+ sizeof(struct cachestat_range))) {
+ fdput(f);
+ return -EFAULT;
+ }
+
+ /* hugetlbfs is not supported */
+ if (is_file_hugepages(f.file)) {
+ fdput(f);
+ return -EOPNOTSUPP;
+ }
+
+ if (flags != 0) {
+ fdput(f);
+ return -EINVAL;
+ }
+
+ first_index = csr.off >> PAGE_SHIFT;
+ last_index =
+ csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
+ memset(&cs, 0, sizeof(struct cachestat));
+ mapping = f.file->f_mapping;
+ filemap_cachestat(mapping, first_index, last_index, &cs);
+ fdput(f);
+
+ if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
+ return -EFAULT;
+
+ return 0;
+}
+#endif /* CONFIG_CACHESTAT_SYSCALL */
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 279e55b4ed87..2fb5df3384b8 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -206,6 +206,7 @@ int __frontswap_load(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
+ bool exclusive = false;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(!PageLocked(page));
@@ -215,9 +216,14 @@ int __frontswap_load(struct page *page)
return -1;
/* Try loading from each implementation, until one succeeds. */
- ret = frontswap_ops->load(type, offset, page);
- if (ret == 0)
+ ret = frontswap_ops->load(type, offset, page, &exclusive);
+ if (ret == 0) {
inc_frontswap_loads();
+ if (exclusive) {
+ SetPageDirty(page);
+ __frontswap_clear(sis, offset);
+ }
+ }
return ret;
}
diff --git a/mm/gup.c b/mm/gup.c
index 0814576b7366..48c1659314b0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -18,6 +18,7 @@
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
+#include <linux/shmem_fs.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -124,65 +125,65 @@ retry:
*/
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
+ struct folio *folio;
+
+ if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
+ return NULL;
+
if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
return NULL;
if (flags & FOLL_GET)
return try_get_folio(page, refs);
- else if (flags & FOLL_PIN) {
- struct folio *folio;
- /*
- * Don't take a pin on the zero page - it's not going anywhere
- * and it is used in a *lot* of places.
- */
- if (is_zero_page(page))
- return page_folio(page);
+ /* FOLL_PIN is set */
- /*
- * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
- * right zone, so fail and let the caller fall back to the slow
- * path.
- */
- if (unlikely((flags & FOLL_LONGTERM) &&
- !is_longterm_pinnable_page(page)))
- return NULL;
+ /*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return page_folio(page);
- /*
- * CAUTION: Don't use compound_head() on the page before this
- * point, the result won't be stable.
- */
- folio = try_get_folio(page, refs);
- if (!folio)
- return NULL;
+ folio = try_get_folio(page, refs);
+ if (!folio)
+ return NULL;
- /*
- * When pinning a large folio, use an exact count to track it.
- *
- * However, be sure to *also* increment the normal folio
- * refcount field at least once, so that the folio really
- * is pinned. That's why the refcount from the earlier
- * try_get_folio() is left intact.
- */
- if (folio_test_large(folio))
- atomic_add(refs, &folio->_pincount);
- else
- folio_ref_add(folio,
- refs * (GUP_PIN_COUNTING_BIAS - 1));
- /*
- * Adjust the pincount before re-checking the PTE for changes.
- * This is essentially a smp_mb() and is paired with a memory
- * barrier in page_try_share_anon_rmap().
- */
- smp_mb__after_atomic();
+ /*
+ * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+ * right zone, so fail and let the caller fall back to the slow
+ * path.
+ */
+ if (unlikely((flags & FOLL_LONGTERM) &&
+ !folio_is_longterm_pinnable(folio))) {
+ if (!put_devmap_managed_page_refs(&folio->page, refs))
+ folio_put_refs(folio, refs);
+ return NULL;
+ }
- node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
+ /*
+ * When pinning a large folio, use an exact count to track it.
+ *
+ * However, be sure to *also* increment the normal folio
+ * refcount field at least once, so that the folio really
+ * is pinned. That's why the refcount from the earlier
+ * try_get_folio() is left intact.
+ */
+ if (folio_test_large(folio))
+ atomic_add(refs, &folio->_pincount);
+ else
+ folio_ref_add(folio,
+ refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in page_try_share_anon_rmap().
+ */
+ smp_mb__after_atomic();
- return folio;
- }
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
- WARN_ON_ONCE(1);
- return NULL;
+ return folio;
}
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
@@ -520,13 +521,14 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
pte_t *pte, unsigned int flags)
{
if (flags & FOLL_TOUCH) {
- pte_t entry = *pte;
+ pte_t orig_entry = ptep_get(pte);
+ pte_t entry = orig_entry;
if (flags & FOLL_WRITE)
entry = pte_mkdirty(entry);
entry = pte_mkyoung(entry);
- if (!pte_same(*pte, entry)) {
+ if (!pte_same(orig_entry, entry)) {
set_pte_at(vma->vm_mm, address, pte, entry);
update_mmu_cache(vma, address, pte);
}
@@ -588,11 +590,11 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
- if (unlikely(pmd_bad(*pmd)))
- return no_page_table(vma, flags);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
- pte = *ptep;
+ if (!ptep)
+ return no_page_table(vma, flags);
+ pte = ptep_get(ptep);
if (!pte_present(pte))
goto no_page;
if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
@@ -697,11 +699,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
pmd = pmd_offset(pudp, address);
- /*
- * The READ_ONCE() will stabilize the pmdval in a register or
- * on the stack so that it will stop changing under the code.
- */
- pmdval = READ_ONCE(*pmd);
+ pmdval = pmdp_get_lockless(pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
if (!pmd_present(pmdval))
@@ -729,21 +727,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
if (flags & FOLL_SPLIT_PMD) {
- int ret;
- page = pmd_page(*pmd);
- if (is_huge_zero_page(page)) {
- spin_unlock(ptl);
- ret = 0;
- split_huge_pmd(vma, pmd, address);
- if (pmd_trans_unstable(pmd))
- ret = -EBUSY;
- } else {
- spin_unlock(ptl);
- split_huge_pmd(vma, pmd, address);
- ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
- }
-
- return ret ? ERR_PTR(ret) :
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, address);
+ /* If pmd was left empty, stuff a page table in there quickly */
+ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
@@ -879,6 +866,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ pte_t entry;
int ret = -EFAULT;
/* user gate pages are read-only */
@@ -899,18 +887,20 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return -EFAULT;
- VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map(pmd, address);
- if (pte_none(*pte))
+ if (!pte)
+ return -EFAULT;
+ entry = ptep_get(pte);
+ if (pte_none(entry))
goto unmap;
*vma = get_gate_vma(mm);
if (!page)
goto out;
- *page = vm_normal_page(*vma, address, *pte);
+ *page = vm_normal_page(*vma, address, entry);
if (!*page) {
- if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
+ if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
goto unmap;
- *page = pte_page(*pte);
+ *page = pte_page(entry);
}
ret = try_grab_page(*page, gup_flags);
if (unlikely(ret))
@@ -1003,16 +993,54 @@ static int faultin_page(struct vm_area_struct *vma,
return 0;
}
+/*
+ * Writing to file-backed mappings which require folio dirty tracking using GUP
+ * is a fundamentally broken operation, as kernel write access to GUP mappings
+ * do not adhere to the semantics expected by a file system.
+ *
+ * Consider the following scenario:-
+ *
+ * 1. A folio is written to via GUP which write-faults the memory, notifying
+ * the file system and dirtying the folio.
+ * 2. Later, writeback is triggered, resulting in the folio being cleaned and
+ * the PTE being marked read-only.
+ * 3. The GUP caller writes to the folio, as it is mapped read/write via the
+ * direct mapping.
+ * 4. The GUP caller, now done with the page, unpins it and sets it dirty
+ * (though it does not have to).
+ *
+ * This results in both data being written to a folio without writenotify, and
+ * the folio being dirtied unexpectedly (if the caller decides to do so).
+ */
+static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
+ unsigned long gup_flags)
+{
+ /*
+ * If we aren't pinning then no problematic write can occur. A long term
+ * pin is the most egregious case so this is the case we disallow.
+ */
+ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
+ (FOLL_PIN | FOLL_LONGTERM))
+ return true;
+
+ /*
+ * If the VMA does not require dirty tracking then no problematic write
+ * can occur either.
+ */
+ return !vma_needs_dirty_tracking(vma);
+}
+
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
vm_flags_t vm_flags = vma->vm_flags;
int write = (gup_flags & FOLL_WRITE);
int foreign = (gup_flags & FOLL_REMOTE);
+ bool vma_anon = vma_is_anonymous(vma);
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
- if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
+ if ((gup_flags & FOLL_ANON) && !vma_anon)
return -EFAULT;
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
@@ -1022,6 +1050,10 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
return -EFAULT;
if (write) {
+ if (!vma_anon &&
+ !writable_file_mapping_allowed(vma, gup_flags))
+ return -EFAULT;
+
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
@@ -1068,8 +1100,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: whether we're still with the mmap_lock held
*
* Returns either number of pages pinned (which may be less than the
@@ -1083,8 +1113,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_lock is held.
- *
* Must be called with mmap_lock held. It may be released. See below.
*
* __get_user_pages walks a process's page tables and takes a reference to
@@ -1120,7 +1148,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
@@ -1160,9 +1188,9 @@ static long __get_user_pages(struct mm_struct *mm,
goto out;
if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &nr_pages, i,
- gup_flags, locked);
+ i = follow_hugetlb_page(mm, vma, pages,
+ &start, &nr_pages, i,
+ gup_flags, locked);
if (!*locked) {
/*
* We've got a VM_FAULT_RETRY
@@ -1227,10 +1255,6 @@ retry:
ctx.page_mask = 0;
}
next_page:
- if (vmas) {
- vmas[i] = vma;
- ctx.page_mask = 0;
- }
page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
@@ -1385,7 +1409,6 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas,
int *locked,
unsigned int flags)
{
@@ -1423,7 +1446,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
pages_done = 0;
for (;;) {
ret = __get_user_pages(mm, start, nr_pages, flags, pages,
- vmas, locked);
+ locked);
if (!(flags & FOLL_UNLOCKABLE)) {
/* VM_FAULT_RETRY couldn't trigger, bypass */
pages_done = ret;
@@ -1487,7 +1510,7 @@ retry:
*locked = 1;
ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
- pages, NULL, locked);
+ pages, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
BUG_ON(ret != 0);
@@ -1585,7 +1608,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* not result in a stack expansion that recurses back here.
*/
ret = __get_user_pages(mm, start, nr_pages, gup_flags,
- NULL, NULL, locked ? locked : &local_locked);
+ NULL, locked ? locked : &local_locked);
lru_add_drain();
return ret;
}
@@ -1643,7 +1666,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
return -EINVAL;
ret = __get_user_pages(mm, start, nr_pages, gup_flags,
- NULL, NULL, locked);
+ NULL, locked);
lru_add_drain();
return ret;
}
@@ -1711,8 +1734,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
- struct vm_area_struct **vmas, int *locked,
- unsigned int foll_flags)
+ int *locked, unsigned int foll_flags)
{
struct vm_area_struct *vma;
bool must_unlock = false;
@@ -1756,8 +1778,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
if (pages[i])
get_page(pages[i]);
}
- if (vmas)
- vmas[i] = vma;
+
start = (start + PAGE_SIZE) & PAGE_MASK;
}
@@ -1938,8 +1959,7 @@ struct page *get_dump_page(unsigned long addr)
int locked = 0;
int ret;
- ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL,
- &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
return (ret == 1) ? page : NULL;
}
@@ -2112,7 +2132,6 @@ static long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas,
int *locked,
unsigned int gup_flags)
{
@@ -2120,13 +2139,13 @@ static long __gup_longterm_locked(struct mm_struct *mm,
long rc, nr_pinned_pages;
if (!(gup_flags & FOLL_LONGTERM))
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages,
locked, gup_flags);
flags = memalloc_pin_save();
do {
nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
- pages, vmas, locked,
+ pages, locked,
gup_flags);
if (nr_pinned_pages <= 0) {
rc = nr_pinned_pages;
@@ -2144,9 +2163,8 @@ static long __gup_longterm_locked(struct mm_struct *mm,
* Check that the given flags are valid for the exported gup/pup interface, and
* update them with the required flags that the caller must have set.
*/
-static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
- int *locked, unsigned int *gup_flags_p,
- unsigned int to_set)
+static bool is_valid_gup_args(struct page **pages, int *locked,
+ unsigned int *gup_flags_p, unsigned int to_set)
{
unsigned int gup_flags = *gup_flags_p;
@@ -2188,13 +2206,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
(gup_flags & FOLL_PCI_P2PDMA)))
return false;
- /*
- * Can't use VMAs with locked, as locked allows GUP to unlock
- * which invalidates the vmas array
- */
- if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE)))
- return false;
-
*gup_flags_p = gup_flags;
return true;
}
@@ -2209,8 +2220,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
@@ -2225,8 +2234,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_lock is held.
- *
* Must be called with mmap_lock held for read or write.
*
* get_user_pages_remote walks a process's page tables and takes a reference
@@ -2263,15 +2270,15 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
int local_locked = 1;
- if (!is_valid_gup_args(pages, vmas, locked, &gup_flags,
+ if (!is_valid_gup_args(pages, locked, &gup_flags,
FOLL_TOUCH | FOLL_REMOTE))
return -EINVAL;
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages,
locked ? locked : &local_locked,
gup_flags);
}
@@ -2281,7 +2288,7 @@ EXPORT_SYMBOL(get_user_pages_remote);
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
return 0;
}
@@ -2295,8 +2302,6 @@ long get_user_pages_remote(struct mm_struct *mm,
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
*
* This is the same as get_user_pages_remote(), just with a less-flexible
* calling convention where we assume that the mm being operated on belongs to
@@ -2304,16 +2309,15 @@ long get_user_pages_remote(struct mm_struct *mm,
* obviously don't pass FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ unsigned int gup_flags, struct page **pages)
{
int locked = 1;
- if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
return -EINVAL;
return __get_user_pages_locked(current->mm, start, nr_pages, pages,
- vmas, &locked, gup_flags);
+ &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages);
@@ -2337,12 +2341,12 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
{
int locked = 0;
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
FOLL_TOUCH | FOLL_UNLOCKABLE))
return -EINVAL;
return __get_user_pages_locked(current->mm, start, nr_pages, pages,
- NULL, &locked, gup_flags);
+ &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
@@ -2381,6 +2385,82 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*/
#ifdef CONFIG_HAVE_FAST_GUP
+/*
+ * Used in the GUP-fast path to determine whether a pin is permitted for a
+ * specific folio.
+ *
+ * This call assumes the caller has pinned the folio, that the lowest page table
+ * level still points to this folio, and that interrupts have been disabled.
+ *
+ * Writing to pinned file-backed dirty tracked folios is inherently problematic
+ * (see comment describing the writable_file_mapping_allowed() function). We
+ * therefore try to avoid the most egregious case of a long-term mapping doing
+ * so.
+ *
+ * This function cannot be as thorough as that one as the VMA is not available
+ * in the fast path, so instead we whitelist known good cases and if in doubt,
+ * fall back to the slow path.
+ */
+static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
+{
+ struct address_space *mapping;
+ unsigned long mapping_flags;
+
+ /*
+ * If we aren't pinning then no problematic write can occur. A long term
+ * pin is the most egregious case so this is the one we disallow.
+ */
+ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) !=
+ (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
+ return true;
+
+ /* The folio is pinned, so we can safely access folio fields. */
+
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return false;
+
+ /* hugetlb mappings do not require dirty-tracking. */
+ if (folio_test_hugetlb(folio))
+ return true;
+
+ /*
+ * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods
+ * cannot proceed, which means no actions performed under RCU can
+ * proceed either.
+ *
+ * inodes and thus their mappings are freed under RCU, which means the
+ * mapping cannot be freed beneath us and thus we can safely dereference
+ * it.
+ */
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * However, there may be operations which _alter_ the mapping, so ensure
+ * we read it once and only once.
+ */
+ mapping = READ_ONCE(folio->mapping);
+
+ /*
+ * The mapping may have been truncated, in any case we cannot determine
+ * if this mapping is safe - fall back to slow path to determine how to
+ * proceed.
+ */
+ if (!mapping)
+ return false;
+
+ /* Anonymous folios pose no problem. */
+ mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
+ if (mapping_flags)
+ return mapping_flags & PAGE_MAPPING_ANON;
+
+ /*
+ * At this point, we know the mapping is non-null and points to an
+ * address_space object. The only remaining whitelisted file system is
+ * shmem.
+ */
+ return shmem_mapping(mapping);
+}
+
static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
unsigned int flags,
struct page **pages)
@@ -2425,6 +2505,8 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
pte_t *ptep, *ptem;
ptem = ptep = pte_offset_map(&pmd, addr);
+ if (!ptep)
+ return 0;
do {
pte_t pte = ptep_get_lockless(ptep);
struct page *page;
@@ -2461,7 +2543,12 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
}
if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
- unlikely(pte_val(pte) != pte_val(*ptep))) {
+ unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+ gup_put_folio(folio, 1, flags);
+ goto pte_unmap;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2653,7 +2740,12 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
if (!folio)
return 0;
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -2724,6 +2816,10 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
@@ -2764,6 +2860,11 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
@@ -2799,6 +2900,16 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
+ if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
*nr += refs;
folio_set_referenced(folio);
return 1;
@@ -3013,7 +3124,7 @@ static int internal_get_user_pages_fast(unsigned long start,
start = untagged_addr(start) & PAGE_MASK;
len = nr_pages << PAGE_SHIFT;
if (check_add_overflow(start, len, &end))
- return 0;
+ return -EOVERFLOW;
if (end > TASK_SIZE_MAX)
return -EFAULT;
if (unlikely(!access_ok((void __user *)start, len)))
@@ -3027,7 +3138,7 @@ static int internal_get_user_pages_fast(unsigned long start,
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
- pages, NULL, &locked,
+ pages, &locked,
gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
if (ret < 0) {
/*
@@ -3069,7 +3180,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages,
* FOLL_FAST_ONLY is required in order to match the API description of
* this routine: no fall back to regular ("slow") GUP.
*/
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
FOLL_GET | FOLL_FAST_ONLY))
return -EINVAL;
@@ -3102,7 +3213,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
* FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
* request.
*/
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
return -EINVAL;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
@@ -3130,7 +3241,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
return -EINVAL;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
@@ -3145,8 +3256,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast);
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
@@ -3164,14 +3273,14 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast);
long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
int local_locked = 1;
- if (!is_valid_gup_args(pages, vmas, locked, &gup_flags,
+ if (!is_valid_gup_args(pages, locked, &gup_flags,
FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
return 0;
- return __gup_longterm_locked(mm, start, nr_pages, pages, vmas,
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
locked ? locked : &local_locked,
gup_flags);
}
@@ -3185,8 +3294,6 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
*
* Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
* FOLL_PIN is set.
@@ -3198,15 +3305,14 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* pins in it and unpin_user_page*() will not remove pins from it.
*/
long pin_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ unsigned int gup_flags, struct page **pages)
{
int locked = 1;
- if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
return 0;
return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, &locked, gup_flags);
+ pages, &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
@@ -3223,11 +3329,11 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
{
int locked = 0;
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
return 0;
- return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL,
+ return __gup_longterm_locked(current->mm, start, nr_pages, pages,
&locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
diff --git a/mm/gup_test.c b/mm/gup_test.c
index c0421b786dcd..eeb3f4d87c51 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -40,24 +40,25 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
unsigned long nr_pages)
{
unsigned long i;
- struct page *page;
+ struct folio *folio;
switch (cmd) {
case PIN_FAST_BENCHMARK:
case PIN_BASIC_TEST:
case PIN_LONGTERM_BENCHMARK:
for (i = 0; i < nr_pages; i++) {
- page = pages[i];
- if (WARN(!page_maybe_dma_pinned(page),
+ folio = page_folio(pages[i]);
+
+ if (WARN(!folio_maybe_dma_pinned(folio),
"pages[%lu] is NOT dma-pinned\n", i)) {
- dump_page(page, "gup_test failure");
+ dump_page(&folio->page, "gup_test failure");
break;
} else if (cmd == PIN_LONGTERM_BENCHMARK &&
- WARN(!is_longterm_pinnable_page(page),
+ WARN(!folio_is_longterm_pinnable(folio),
"pages[%lu] is NOT pinnable but pinned\n",
i)) {
- dump_page(page, "gup_test failure");
+ dump_page(&folio->page, "gup_test failure");
break;
}
}
@@ -139,29 +140,27 @@ static int __gup_test_ioctl(unsigned int cmd,
pages + i);
break;
case GUP_BASIC_TEST:
- nr = get_user_pages(addr, nr, gup->gup_flags, pages + i,
- NULL);
+ nr = get_user_pages(addr, nr, gup->gup_flags, pages + i);
break;
case PIN_FAST_BENCHMARK:
nr = pin_user_pages_fast(addr, nr, gup->gup_flags,
pages + i);
break;
case PIN_BASIC_TEST:
- nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i,
- NULL);
+ nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i);
break;
case PIN_LONGTERM_BENCHMARK:
nr = pin_user_pages(addr, nr,
gup->gup_flags | FOLL_LONGTERM,
- pages + i, NULL);
+ pages + i);
break;
case DUMP_USER_PAGES_TEST:
if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
nr = pin_user_pages(addr, nr, gup->gup_flags,
- pages + i, NULL);
+ pages + i);
else
nr = get_user_pages(addr, nr, gup->gup_flags,
- pages + i, NULL);
+ pages + i);
break;
default:
ret = -EINVAL;
@@ -271,7 +270,7 @@ static inline int pin_longterm_test_start(unsigned long arg)
gup_flags, pages);
else
cur_pages = pin_user_pages(addr, remaining_pages,
- gup_flags, pages, NULL);
+ gup_flags, pages);
if (cur_pages < 0) {
pin_longterm_test_stop();
ret = cur_pages;
diff --git a/mm/highmem.c b/mm/highmem.c
index db251e77f98f..e19269093a93 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -161,7 +161,7 @@ struct page *__kmap_to_page(void *vaddr)
/* kmap() mappings */
if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) &&
addr < PKMAP_ADDR(LAST_PKMAP)))
- return pte_page(pkmap_page_table[PKMAP_NR(addr)]);
+ return pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(addr)]));
/* kmap_local_page() mappings */
if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) &&
@@ -191,6 +191,7 @@ static void flush_all_zero_pkmaps(void)
for (i = 0; i < LAST_PKMAP; i++) {
struct page *page;
+ pte_t ptent;
/*
* zero means we don't have anything to do,
@@ -203,7 +204,8 @@ static void flush_all_zero_pkmaps(void)
pkmap_count[i] = 0;
/* sanity check */
- BUG_ON(pte_none(pkmap_page_table[i]));
+ ptent = ptep_get(&pkmap_page_table[i]);
+ BUG_ON(pte_none(ptent));
/*
* Don't need an atomic fetch-and-clear op here;
@@ -212,7 +214,7 @@ static void flush_all_zero_pkmaps(void)
* getting the kmap_lock (which is held here).
* So no dangers, even with speculative execution.
*/
- page = pte_page(pkmap_page_table[i]);
+ page = pte_page(ptent);
pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
set_page_address(page, NULL);
@@ -511,7 +513,7 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr)
{
#ifdef ARCH_NEEDS_KMAP_HIGH_GET
if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
- kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
+ kunmap_high(pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(vaddr)])));
return true;
}
#endif
@@ -548,7 +550,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn);
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
kmap_pte = kmap_get_pte(vaddr, idx);
- BUG_ON(!pte_none(*kmap_pte));
+ BUG_ON(!pte_none(ptep_get(kmap_pte)));
pteval = pfn_pte(pfn, prot);
arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval);
arch_kmap_local_post_map(vaddr, pteval);
diff --git a/mm/hmm.c b/mm/hmm.c
index 6a151c09de5e..855e25e59d8f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -228,7 +228,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault;
unsigned long cpu_flags;
- pte_t pte = *ptep;
+ pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
if (pte_none_mostly(pte)) {
@@ -332,7 +332,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
pmd_t pmd;
again:
- pmd = READ_ONCE(*pmdp);
+ pmd = pmdp_get_lockless(pmdp);
if (pmd_none(pmd))
return hmm_vma_walk_hole(start, end, -1, walk);
@@ -381,6 +381,8 @@ again:
}
ptep = pte_offset_map(pmdp, addr);
+ if (!ptep)
+ goto again;
for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
int r;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 624671aaa60d..eb3678360b97 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -583,7 +583,7 @@ void prep_transhuge_page(struct page *page)
VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
INIT_LIST_HEAD(&folio->_deferred_list);
- set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+ folio_set_compound_dtor(folio, TRANSHUGE_PAGE_DTOR);
}
static inline bool is_transparent_hugepage(struct page *page)
@@ -1344,7 +1344,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
/*
* See do_wp_page(): we can only reuse the folio exclusively if
* there are no additional references. Note that we always drain
- * the LRU pagevecs immediately after adding a THP.
+ * the LRU cache immediately after adding a THP.
*/
if (folio_ref_count(folio) >
1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
@@ -1760,9 +1760,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
/*
* The destination pmd shouldn't be established, free_pgtables()
- * should have release it.
+ * should have released it; but move_page_tables() might have already
+ * inserted a page table, if racing against shmem/file collapse.
*/
- if (WARN_ON(!pmd_none(*new_pmd))) {
+ if (!pmd_none(*new_pmd)) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
return false;
}
@@ -2036,6 +2037,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
pgtable_t pgtable;
pmd_t _pmd, old_pmd;
+ unsigned long addr;
+ pte_t *pte;
int i;
/*
@@ -2051,17 +2054,20 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte);
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ pte_t entry;
+
+ entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
entry = pte_mkspecial(entry);
if (pmd_uffd_wp(old_pmd))
entry = pte_mkuffd_wp(entry);
- pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
+ VM_BUG_ON(!pte_none(ptep_get(pte)));
+ set_pte_at(mm, addr, pte, entry);
+ pte++;
}
+ pte_unmap(pte - 1);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
}
@@ -2076,6 +2082,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
bool anon_exclusive = false, dirty = false;
unsigned long addr;
+ pte_t *pte;
int i;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
@@ -2204,8 +2211,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte);
for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
- pte_t entry, *pte;
+ pte_t entry;
/*
* Note that NUMA hinting access restrictions are not
* transferred to avoid any possibility of altering
@@ -2248,11 +2257,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_mkuffd_wp(entry);
page_add_anon_rmap(page + i, vma, addr, false);
}
- pte = pte_offset_map(&_pmd, addr);
- BUG_ON(!pte_none(*pte));
+ VM_BUG_ON(!pte_none(ptep_get(pte)));
set_pte_at(mm, addr, pte, entry);
- pte_unmap(pte);
+ pte++;
}
+ pte_unmap(pte - 1);
if (!pmd_migration)
page_remove_rmap(page, vma, true);
@@ -2792,12 +2801,19 @@ void free_transhuge_page(struct page *page)
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
unsigned long flags;
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
- if (!list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
- list_del(&folio->_deferred_list);
+ /*
+ * At this point, there is no one trying to add the folio to
+ * deferred_list. If folio is not in deferred_list, it's safe
+ * to check without acquiring the split_queue_lock.
+ */
+ if (data_race(!list_empty(&folio->_deferred_list))) {
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ if (!list_empty(&folio->_deferred_list)) {
+ ds_queue->split_queue_len--;
+ list_del(&folio->_deferred_list);
+ }
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
free_compound_page(page);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f154019e6b84..bce28cca73a1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1489,7 +1489,6 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
set_page_refcounted(p);
}
- folio_set_order(folio, 0);
__folio_clear_head(folio);
}
@@ -1951,9 +1950,6 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
struct page *p;
__folio_clear_reserved(folio);
- __folio_set_head(folio);
- /* we rely on prep_new_hugetlb_folio to set the destructor */
- folio_set_order(folio, order);
for (i = 0; i < nr_pages; i++) {
p = folio_page(folio, i);
@@ -1999,6 +1995,9 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
if (i != 0)
set_compound_head(p, &folio->page);
}
+ __folio_set_head(folio);
+ /* we rely on prep_new_hugetlb_folio to set the destructor */
+ folio_set_order(folio, order);
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
atomic_set(&folio->_pincount, 0);
@@ -2017,8 +2016,6 @@ out_error:
p = folio_page(folio, j);
__ClearPageReserved(p);
}
- folio_set_order(folio, 0);
- __folio_clear_head(folio);
return false;
}
@@ -5016,7 +5013,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *src_vma)
{
pte_t *src_pte, *dst_pte, entry;
- struct page *ptepage;
+ struct folio *pte_folio;
unsigned long addr;
bool cow = is_cow_mapping(src_vma->vm_flags);
struct hstate *h = hstate_vma(src_vma);
@@ -5115,8 +5112,8 @@ again:
set_huge_pte_at(dst, addr, dst_pte, entry);
} else {
entry = huge_ptep_get(src_pte);
- ptepage = pte_page(entry);
- get_page(ptepage);
+ pte_folio = page_folio(pte_page(entry));
+ folio_get(pte_folio);
/*
* Failing to duplicate the anon rmap is a rare case
@@ -5128,10 +5125,10 @@ again:
* need to be without the pgtable locks since we could
* sleep during the process.
*/
- if (!PageAnon(ptepage)) {
- page_dup_file_rmap(ptepage, true);
- } else if (page_try_dup_anon_rmap(ptepage, true,
- src_vma)) {
+ if (!folio_test_anon(pte_folio)) {
+ page_dup_file_rmap(&pte_folio->page, true);
+ } else if (page_try_dup_anon_rmap(&pte_folio->page,
+ true, src_vma)) {
pte_t src_pte_old = entry;
struct folio *new_folio;
@@ -5140,14 +5137,14 @@ again:
/* Do not use reserve as it's private owned */
new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
if (IS_ERR(new_folio)) {
- put_page(ptepage);
+ folio_put(pte_folio);
ret = PTR_ERR(new_folio);
break;
}
ret = copy_user_large_folio(new_folio,
- page_folio(ptepage),
- addr, dst_vma);
- put_page(ptepage);
+ pte_folio,
+ addr, dst_vma);
+ folio_put(pte_folio);
if (ret) {
folio_put(new_folio);
break;
@@ -5540,7 +5537,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
const bool unshare = flags & FAULT_FLAG_UNSHARE;
pte_t pte = huge_ptep_get(ptep);
struct hstate *h = hstate_vma(vma);
- struct page *old_page;
+ struct folio *old_folio;
struct folio *new_folio;
int outside_reserve = 0;
vm_fault_t ret = 0;
@@ -5571,7 +5568,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
- old_page = pte_page(pte);
+ old_folio = page_folio(pte_page(pte));
delayacct_wpcopy_start();
@@ -5580,17 +5577,17 @@ retry_avoidcopy:
* If no-one else is actually using this page, we're the exclusive
* owner and can reuse this page.
*/
- if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
- if (!PageAnonExclusive(old_page))
- page_move_anon_rmap(old_page, vma);
+ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
+ if (!PageAnonExclusive(&old_folio->page))
+ page_move_anon_rmap(&old_folio->page, vma);
if (likely(!unshare))
set_huge_ptep_writable(vma, haddr, ptep);
delayacct_wpcopy_end();
return 0;
}
- VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
- old_page);
+ VM_BUG_ON_PAGE(folio_test_anon(old_folio) &&
+ PageAnonExclusive(&old_folio->page), &old_folio->page);
/*
* If the process that created a MAP_PRIVATE mapping is about to
@@ -5602,10 +5599,10 @@ retry_avoidcopy:
* of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
- page_folio(old_page) != pagecache_folio)
+ old_folio != pagecache_folio)
outside_reserve = 1;
- get_page(old_page);
+ folio_get(old_folio);
/*
* Drop page table lock as buddy allocator may be called. It will
@@ -5627,7 +5624,7 @@ retry_avoidcopy:
pgoff_t idx;
u32 hash;
- put_page(old_page);
+ folio_put(old_folio);
/*
* Drop hugetlb_fault_mutex and vma_lock before
* unmapping. unmapping needs to hold vma_lock
@@ -5642,7 +5639,7 @@ retry_avoidcopy:
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- unmap_ref_private(mm, vma, old_page, haddr);
+ unmap_ref_private(mm, vma, &old_folio->page, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
@@ -5672,7 +5669,7 @@ retry_avoidcopy:
goto out_release_all;
}
- if (copy_user_large_folio(new_folio, page_folio(old_page), address, vma)) {
+ if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
ret = VM_FAULT_HWPOISON_LARGE;
goto out_release_all;
}
@@ -5694,14 +5691,14 @@ retry_avoidcopy:
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
- page_remove_rmap(old_page, vma, true);
+ page_remove_rmap(&old_folio->page, vma, true);
hugepage_add_new_anon_rmap(new_folio, vma, haddr);
if (huge_pte_uffd_wp(pte))
newpte = huge_pte_mkuffd_wp(newpte);
set_huge_pte_at(mm, haddr, ptep, newpte);
folio_set_hugetlb_migratable(new_folio);
/* Make the old page be freed below */
- new_folio = page_folio(old_page);
+ new_folio = old_folio;
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
@@ -5710,11 +5707,11 @@ out_release_all:
* No restore in case of successful pagetable update (Break COW or
* unshare)
*/
- if (new_folio != page_folio(old_page))
+ if (new_folio != old_folio)
restore_reserve_on_error(h, vma, haddr, new_folio);
folio_put(new_folio);
out_release_old:
- put_page(old_page);
+ folio_put(old_folio);
spin_lock(ptl); /* Caller expects lock to be held */
@@ -5731,13 +5728,13 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
{
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = vma_hugecache_offset(h, vma, address);
- bool present;
-
- rcu_read_lock();
- present = page_cache_next_miss(mapping, idx, 1) != idx;
- rcu_read_unlock();
+ struct folio *folio;
- return present;
+ folio = filemap_get_folio(mapping, idx);
+ if (IS_ERR(folio))
+ return false;
+ folio_put(folio);
+ return true;
}
int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
@@ -6062,7 +6059,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vm_fault_t ret;
u32 hash;
pgoff_t idx;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct folio *pagecache_folio = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
@@ -6179,16 +6176,16 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* hugetlb_wp() requires page locks of pte_page(entry) and
* pagecache_folio, so here we need take the former one
- * when page != pagecache_folio or !pagecache_folio.
+ * when folio != pagecache_folio or !pagecache_folio.
*/
- page = pte_page(entry);
- if (page_folio(page) != pagecache_folio)
- if (!trylock_page(page)) {
+ folio = page_folio(pte_page(entry));
+ if (folio != pagecache_folio)
+ if (!folio_trylock(folio)) {
need_wait_lock = 1;
goto out_ptl;
}
- get_page(page);
+ folio_get(folio);
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(entry)) {
@@ -6204,9 +6201,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, haddr, ptep);
out_put_page:
- if (page_folio(page) != pagecache_folio)
- unlock_page(page);
- put_page(page);
+ if (folio != pagecache_folio)
+ folio_unlock(folio);
+ folio_put(folio);
out_ptl:
spin_unlock(ptl);
@@ -6225,7 +6222,7 @@ out_mutex:
* here without taking refcount.
*/
if (need_wait_lock)
- wait_on_page_locked(page);
+ folio_wait_locked(folio);
return ret;
}
@@ -6425,17 +6422,14 @@ out_release_nounlock:
}
#endif /* CONFIG_USERFAULTFD */
-static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
- int refs, struct page **pages,
- struct vm_area_struct **vmas)
+static void record_subpages(struct page *page, struct vm_area_struct *vma,
+ int refs, struct page **pages)
{
int nr;
for (nr = 0; nr < refs; nr++) {
if (likely(pages))
pages[nr] = nth_page(page, nr);
- if (vmas)
- vmas[nr] = vma;
}
}
@@ -6508,9 +6502,9 @@ out_unlock:
}
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *locked)
+ struct page **pages, unsigned long *position,
+ unsigned long *nr_pages, long i, unsigned int flags,
+ int *locked)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
@@ -6638,7 +6632,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* If subpage information not requested, update counters
* and skip the same_page loop below.
*/
- if (!pages && !vmas && !pfn_offset &&
+ if (!pages && !pfn_offset &&
(vaddr + huge_page_size(h) < vma->vm_end) &&
(remainder >= pages_per_huge_page(h))) {
vaddr += huge_page_size(h);
@@ -6653,11 +6647,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
(vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
- if (pages || vmas)
- record_subpages_vmas(nth_page(page, pfn_offset),
- vma, refs,
- likely(pages) ? pages + i : NULL,
- vmas ? vmas + i : NULL);
+ if (pages)
+ record_subpages(nth_page(page, pfn_offset),
+ vma, refs,
+ likely(pages) ? pages + i : NULL);
if (pages) {
/*
@@ -7137,7 +7130,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long saddr;
pte_t *spte = NULL;
pte_t *pte;
- spinlock_t *ptl;
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
@@ -7158,7 +7150,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
if (!spte)
goto out;
- ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
+ spin_lock(&mm->page_table_lock);
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
@@ -7166,7 +7158,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
put_page(virt_to_page(spte));
}
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
i_mmap_unlock_read(mapping);
@@ -7254,7 +7246,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
}
- BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
+ BUG_ON(pte && pte_present(ptep_get(pte)) && !pte_huge(ptep_get(pte)));
return pte;
}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 27f001e0f0a2..c2007ef5e9b0 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -105,7 +105,7 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
* remapping (which is calling @walk->remap_pte).
*/
if (!walk->reuse_page) {
- walk->reuse_page = pte_page(*pte);
+ walk->reuse_page = pte_page(ptep_get(pte));
/*
* Because the reuse address is part of the range that we are
* walking, skip the reuse address range.
@@ -239,7 +239,7 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* to the tail pages.
*/
pgprot_t pgprot = PAGE_KERNEL_RO;
- struct page *page = pte_page(*pte);
+ struct page *page = pte_page(ptep_get(pte));
pte_t entry;
/* Remapping the head page requires r/w */
@@ -286,7 +286,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
struct page *page;
void *to;
- BUG_ON(pte_page(*pte) != walk->reuse_page);
+ BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
page = list_first_entry(walk->vmemmap_pages, struct page, lru);
list_del(&page->lru);
@@ -384,8 +384,9 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
}
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
- gfp_t gfp_mask, struct list_head *list)
+ struct list_head *list)
{
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
int nid = page_to_nid((struct page *)start);
struct page *page, *next;
@@ -413,12 +414,11 @@ out:
* @end: end address of the vmemmap virtual address range that we want to
* remap.
* @reuse: reuse address.
- * @gfp_mask: GFP flag for allocating vmemmap pages.
*
* Return: %0 on success, negative error code otherwise.
*/
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
- unsigned long reuse, gfp_t gfp_mask)
+ unsigned long reuse)
{
LIST_HEAD(vmemmap_pages);
struct vmemmap_remap_walk walk = {
@@ -430,7 +430,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
/* See the comment in the vmemmap_remap_free(). */
BUG_ON(start - reuse != PAGE_SIZE);
- if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+ if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
return -ENOMEM;
mmap_read_lock(&init_mm);
@@ -476,8 +476,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
* When a HugeTLB page is freed to the buddy allocator, previously
* discarded vmemmap pages must be allocated and remapping.
*/
- ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
- GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
+ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse);
if (!ret) {
ClearHPageVmemmapOptimized(head);
static_branch_dec(&hugetlb_optimize_vmemmap_key);
diff --git a/mm/internal.h b/mm/internal.h
index e6029d94bdb2..a7d9e980429a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -133,8 +133,8 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
loff_t end);
long invalidate_inode_page(struct page *page);
-unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
- pgoff_t start, pgoff_t end, unsigned long *nr_pagevec);
+unsigned long mapping_try_invalidate(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_failed);
/**
* folio_evictable - Test whether a folio is evictable.
@@ -202,10 +202,12 @@ extern char * const zone_names[MAX_NR_ZONES];
/* perform sanity checks on struct pages being allocated or freed */
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
-static inline bool is_check_pages_enabled(void)
-{
- return static_branch_unlikely(&check_pages_enabled);
-}
+extern int min_free_kbytes;
+
+void setup_per_zone_wmarks(void);
+void calculate_min_free_kbytes(void);
+int __meminit init_per_zone_wmark_min(void);
+void page_alloc_sysctl_init(void);
/*
* Structure for holding the mostly immutable allocation parameters passed
@@ -365,6 +367,13 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
}
+void set_zone_contiguous(struct zone *zone);
+
+static inline void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
@@ -372,12 +381,27 @@ extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);
+/*
+ * This will have no effect, other than possibly generating a warning, if the
+ * caller passes in a non-large folio.
+ */
+static inline void folio_set_order(struct folio *folio, unsigned int order)
+{
+ if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
+ return;
+
+ folio->_folio_order = order;
+#ifdef CONFIG_64BIT
+ folio->_folio_nr_pages = 1U << order;
+#endif
+}
+
static inline void prep_compound_head(struct page *page, unsigned int order)
{
struct folio *folio = (struct folio *)page;
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
- set_compound_order(page, order);
+ folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+ folio_set_order(folio, order);
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
atomic_set(&folio->_pincount, 0);
@@ -410,27 +434,12 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr,
int nid, bool exact_nid);
-int split_free_page(struct page *free_page,
- unsigned int order, unsigned long split_pfn_offset);
+void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
+ unsigned long, enum meminit_context, struct vmem_altmap *, int);
-/*
- * This will have no effect, other than possibly generating a warning, if the
- * caller passes in a non-large folio.
- */
-static inline void folio_set_order(struct folio *folio, unsigned int order)
-{
- if (WARN_ON_ONCE(!folio_test_large(folio)))
- return;
- folio->_folio_order = order;
-#ifdef CONFIG_64BIT
- /*
- * When hugetlb dissolves a folio, we need to clear the tail
- * page, rather than setting nr_pages to 1.
- */
- folio->_folio_nr_pages = order ? 1U << order : 0;
-#endif
-}
+int split_free_page(struct page *free_page,
+ unsigned int order, unsigned long split_pfn_offset);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -557,8 +566,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
extern long faultin_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
bool write, int *locked);
-extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
- unsigned long len);
+extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+ unsigned long bytes);
/*
* mlock_vma_folio() and munlock_vma_folio():
* should be called with vma's mmap_lock held for read or write,
@@ -1041,17 +1050,17 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
{
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
- if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) {
- printk("%lu > %lu\n", vmi->mas.index, vma->vm_start);
- printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end);
- printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last);
- mt_dump(vmi->mas.tree);
+ if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+ vmi->mas.index > vma->vm_start)) {
+ pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
+ vmi->mas.index, vma->vm_start, vma->vm_start,
+ vma->vm_end, vmi->mas.index, vmi->mas.last);
}
- if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) {
- printk("%lu < %lu\n", vmi->mas.last, vma->vm_start);
- printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end);
- printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last);
- mt_dump(vmi->mas.tree);
+ if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+ vmi->mas.last < vma->vm_start)) {
+ pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
+ vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
+ vmi->mas.index, vmi->mas.last);
}
#endif
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index b376a5d055e5..256930da578a 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -445,7 +445,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
bool __kasan_check_byte(const void *address, unsigned long ip)
{
if (!kasan_byte_accessible(address)) {
- kasan_report((unsigned long)address, 1, false, ip);
+ kasan_report(address, 1, false, ip);
return false;
}
return true;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index e5eef670735e..5b4c97baa656 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -40,39 +40,39 @@
* depending on memory access size X.
*/
-static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+static __always_inline bool memory_is_poisoned_1(const void *addr)
{
- s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+ s8 shadow_value = *(s8 *)kasan_mem_to_shadow(addr);
if (unlikely(shadow_value)) {
- s8 last_accessible_byte = addr & KASAN_GRANULE_MASK;
+ s8 last_accessible_byte = (unsigned long)addr & KASAN_GRANULE_MASK;
return unlikely(last_accessible_byte >= shadow_value);
}
return false;
}
-static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
+static __always_inline bool memory_is_poisoned_2_4_8(const void *addr,
unsigned long size)
{
- u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
+ u8 *shadow_addr = (u8 *)kasan_mem_to_shadow(addr);
/*
* Access crosses 8(shadow size)-byte boundary. Such access maps
* into 2 shadow bytes, so we need to check them both.
*/
- if (unlikely(((addr + size - 1) & KASAN_GRANULE_MASK) < size - 1))
+ if (unlikely((((unsigned long)addr + size - 1) & KASAN_GRANULE_MASK) < size - 1))
return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
return memory_is_poisoned_1(addr + size - 1);
}
-static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+static __always_inline bool memory_is_poisoned_16(const void *addr)
{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow(addr);
/* Unaligned 16-bytes access maps into 3 shadow bytes. */
- if (unlikely(!IS_ALIGNED(addr, KASAN_GRANULE_SIZE)))
+ if (unlikely(!IS_ALIGNED((unsigned long)addr, KASAN_GRANULE_SIZE)))
return *shadow_addr || memory_is_poisoned_1(addr + 15);
return *shadow_addr;
@@ -120,26 +120,25 @@ static __always_inline unsigned long memory_is_nonzero(const void *start,
return bytes_is_nonzero(start, (end - start) % 8);
}
-static __always_inline bool memory_is_poisoned_n(unsigned long addr,
- size_t size)
+static __always_inline bool memory_is_poisoned_n(const void *addr, size_t size)
{
unsigned long ret;
- ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
- kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+ ret = memory_is_nonzero(kasan_mem_to_shadow(addr),
+ kasan_mem_to_shadow(addr + size - 1) + 1);
if (unlikely(ret)) {
- unsigned long last_byte = addr + size - 1;
- s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+ const void *last_byte = addr + size - 1;
+ s8 *last_shadow = (s8 *)kasan_mem_to_shadow(last_byte);
if (unlikely(ret != (unsigned long)last_shadow ||
- ((long)(last_byte & KASAN_GRANULE_MASK) >= *last_shadow)))
+ (((long)last_byte & KASAN_GRANULE_MASK) >= *last_shadow)))
return true;
}
return false;
}
-static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+static __always_inline bool memory_is_poisoned(const void *addr, size_t size)
{
if (__builtin_constant_p(size)) {
switch (size) {
@@ -159,7 +158,7 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
return memory_is_poisoned_n(addr, size);
}
-static __always_inline bool check_region_inline(unsigned long addr,
+static __always_inline bool check_region_inline(const void *addr,
size_t size, bool write,
unsigned long ret_ip)
{
@@ -172,7 +171,7 @@ static __always_inline bool check_region_inline(unsigned long addr,
if (unlikely(addr + size < addr))
return !kasan_report(addr, size, write, ret_ip);
- if (unlikely(!addr_has_metadata((void *)addr)))
+ if (unlikely(!addr_has_metadata(addr)))
return !kasan_report(addr, size, write, ret_ip);
if (likely(!memory_is_poisoned(addr, size)))
@@ -181,7 +180,7 @@ static __always_inline bool check_region_inline(unsigned long addr,
return !kasan_report(addr, size, write, ret_ip);
}
-bool kasan_check_range(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(const void *addr, size_t size, bool write,
unsigned long ret_ip)
{
return check_region_inline(addr, size, write, ret_ip);
@@ -221,36 +220,37 @@ static void register_global(struct kasan_global *global)
KASAN_GLOBAL_REDZONE, false);
}
-void __asan_register_globals(struct kasan_global *globals, size_t size)
+void __asan_register_globals(void *ptr, ssize_t size)
{
int i;
+ struct kasan_global *globals = ptr;
for (i = 0; i < size; i++)
register_global(&globals[i]);
}
EXPORT_SYMBOL(__asan_register_globals);
-void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+void __asan_unregister_globals(void *ptr, ssize_t size)
{
}
EXPORT_SYMBOL(__asan_unregister_globals);
#define DEFINE_ASAN_LOAD_STORE(size) \
- void __asan_load##size(unsigned long addr) \
+ void __asan_load##size(void *addr) \
{ \
check_region_inline(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_load##size); \
__alias(__asan_load##size) \
- void __asan_load##size##_noabort(unsigned long); \
+ void __asan_load##size##_noabort(void *); \
EXPORT_SYMBOL(__asan_load##size##_noabort); \
- void __asan_store##size(unsigned long addr) \
+ void __asan_store##size(void *addr) \
{ \
check_region_inline(addr, size, true, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_store##size); \
__alias(__asan_store##size) \
- void __asan_store##size##_noabort(unsigned long); \
+ void __asan_store##size##_noabort(void *); \
EXPORT_SYMBOL(__asan_store##size##_noabort)
DEFINE_ASAN_LOAD_STORE(1);
@@ -259,24 +259,24 @@ DEFINE_ASAN_LOAD_STORE(4);
DEFINE_ASAN_LOAD_STORE(8);
DEFINE_ASAN_LOAD_STORE(16);
-void __asan_loadN(unsigned long addr, size_t size)
+void __asan_loadN(void *addr, ssize_t size)
{
kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_loadN);
__alias(__asan_loadN)
-void __asan_loadN_noabort(unsigned long, size_t);
+void __asan_loadN_noabort(void *, ssize_t);
EXPORT_SYMBOL(__asan_loadN_noabort);
-void __asan_storeN(unsigned long addr, size_t size)
+void __asan_storeN(void *addr, ssize_t size)
{
kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__asan_storeN);
__alias(__asan_storeN)
-void __asan_storeN_noabort(unsigned long, size_t);
+void __asan_storeN_noabort(void *, ssize_t);
EXPORT_SYMBOL(__asan_storeN_noabort);
/* to shut up compiler complaints */
@@ -284,7 +284,7 @@ void __asan_handle_no_return(void) {}
EXPORT_SYMBOL(__asan_handle_no_return);
/* Emitted by compiler to poison alloca()ed objects. */
-void __asan_alloca_poison(unsigned long addr, size_t size)
+void __asan_alloca_poison(void *addr, ssize_t size)
{
size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE);
size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
@@ -295,7 +295,7 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
KASAN_ALLOCA_REDZONE_SIZE);
const void *right_redzone = (const void *)(addr + rounded_up_size);
- WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
+ WARN_ON(!IS_ALIGNED((unsigned long)addr, KASAN_ALLOCA_REDZONE_SIZE));
kasan_unpoison((const void *)(addr + rounded_down_size),
size - rounded_down_size, false);
@@ -307,18 +307,18 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
EXPORT_SYMBOL(__asan_alloca_poison);
/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
-void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
+void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom)
{
- if (unlikely(!stack_top || stack_top > stack_bottom))
+ if (unlikely(!stack_top || stack_top > (void *)stack_bottom))
return;
- kasan_unpoison(stack_top, stack_bottom - stack_top, false);
+ kasan_unpoison(stack_top, (void *)stack_bottom - stack_top, false);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
/* Emitted by the compiler to [un]poison local variables. */
#define DEFINE_ASAN_SET_SHADOW(byte) \
- void __asan_set_shadow_##byte(const void *addr, size_t size) \
+ void __asan_set_shadow_##byte(const void *addr, ssize_t size) \
{ \
__memset((void *)addr, 0x##byte, size); \
} \
@@ -488,7 +488,7 @@ static void __kasan_record_aux_stack(void *addr, bool can_alloc)
return;
alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
- alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc);
+ alloc_meta->aux_stack[0] = kasan_save_stack(0, can_alloc);
}
void kasan_record_aux_stack(void *addr)
@@ -518,7 +518,7 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object)
if (!free_meta)
return;
- kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+ kasan_set_track(&free_meta->free_track, 0);
/* The object was freed and has free track set. */
*(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK;
}
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index cc64ed6858c6..dcfec277e839 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -286,7 +286,7 @@ static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd)
for (i = 0; i < PTRS_PER_PTE; i++) {
pte = pte_start + i;
- if (!pte_none(*pte))
+ if (!pte_none(ptep_get(pte)))
return;
}
@@ -343,16 +343,19 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
unsigned long end)
{
unsigned long next;
+ pte_t ptent;
for (; addr < end; addr = next, pte++) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
if (next > end)
next = end;
- if (!pte_present(*pte))
+ ptent = ptep_get(pte);
+
+ if (!pte_present(ptent))
continue;
- if (WARN_ON(!kasan_early_shadow_page_entry(*pte)))
+ if (WARN_ON(!kasan_early_shadow_page_entry(ptent)))
continue;
pte_clear(&init_mm, addr, pte);
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index f5e4f5f2ba20..b799f11e45dc 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -198,13 +198,13 @@ enum kasan_report_type {
struct kasan_report_info {
/* Filled in by kasan_report_*(). */
enum kasan_report_type type;
- void *access_addr;
+ const void *access_addr;
size_t access_size;
bool is_write;
unsigned long ip;
/* Filled in by the common reporting code. */
- void *first_bad_addr;
+ const void *first_bad_addr;
struct kmem_cache *cache;
void *object;
size_t alloc_size;
@@ -311,7 +311,7 @@ static __always_inline bool addr_has_metadata(const void *addr)
* @ret_ip: return address
* @return: true if access was valid, false if invalid
*/
-bool kasan_check_range(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(const void *addr, size_t size, bool write,
unsigned long ret_ip);
#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
@@ -323,7 +323,7 @@ static __always_inline bool addr_has_metadata(const void *addr)
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-void *kasan_find_first_bad_addr(void *addr, size_t size);
+const void *kasan_find_first_bad_addr(const void *addr, size_t size);
size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache);
void kasan_complete_mode_report_info(struct kasan_report_info *info);
void kasan_metadata_fetch_row(char *buffer, void *row);
@@ -346,7 +346,7 @@ void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object);
static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { }
#endif
-bool kasan_report(unsigned long addr, size_t size,
+bool kasan_report(const void *addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type);
@@ -571,79 +571,82 @@ void kasan_restore_multi_shot(bool enabled);
*/
asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
-void __asan_register_globals(struct kasan_global *globals, size_t size);
-void __asan_unregister_globals(struct kasan_global *globals, size_t size);
+void __asan_register_globals(void *globals, ssize_t size);
+void __asan_unregister_globals(void *globals, ssize_t size);
void __asan_handle_no_return(void);
-void __asan_alloca_poison(unsigned long addr, size_t size);
-void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
-
-void __asan_load1(unsigned long addr);
-void __asan_store1(unsigned long addr);
-void __asan_load2(unsigned long addr);
-void __asan_store2(unsigned long addr);
-void __asan_load4(unsigned long addr);
-void __asan_store4(unsigned long addr);
-void __asan_load8(unsigned long addr);
-void __asan_store8(unsigned long addr);
-void __asan_load16(unsigned long addr);
-void __asan_store16(unsigned long addr);
-void __asan_loadN(unsigned long addr, size_t size);
-void __asan_storeN(unsigned long addr, size_t size);
-
-void __asan_load1_noabort(unsigned long addr);
-void __asan_store1_noabort(unsigned long addr);
-void __asan_load2_noabort(unsigned long addr);
-void __asan_store2_noabort(unsigned long addr);
-void __asan_load4_noabort(unsigned long addr);
-void __asan_store4_noabort(unsigned long addr);
-void __asan_load8_noabort(unsigned long addr);
-void __asan_store8_noabort(unsigned long addr);
-void __asan_load16_noabort(unsigned long addr);
-void __asan_store16_noabort(unsigned long addr);
-void __asan_loadN_noabort(unsigned long addr, size_t size);
-void __asan_storeN_noabort(unsigned long addr, size_t size);
-
-void __asan_report_load1_noabort(unsigned long addr);
-void __asan_report_store1_noabort(unsigned long addr);
-void __asan_report_load2_noabort(unsigned long addr);
-void __asan_report_store2_noabort(unsigned long addr);
-void __asan_report_load4_noabort(unsigned long addr);
-void __asan_report_store4_noabort(unsigned long addr);
-void __asan_report_load8_noabort(unsigned long addr);
-void __asan_report_store8_noabort(unsigned long addr);
-void __asan_report_load16_noabort(unsigned long addr);
-void __asan_report_store16_noabort(unsigned long addr);
-void __asan_report_load_n_noabort(unsigned long addr, size_t size);
-void __asan_report_store_n_noabort(unsigned long addr, size_t size);
-
-void __asan_set_shadow_00(const void *addr, size_t size);
-void __asan_set_shadow_f1(const void *addr, size_t size);
-void __asan_set_shadow_f2(const void *addr, size_t size);
-void __asan_set_shadow_f3(const void *addr, size_t size);
-void __asan_set_shadow_f5(const void *addr, size_t size);
-void __asan_set_shadow_f8(const void *addr, size_t size);
-
-void *__asan_memset(void *addr, int c, size_t len);
-void *__asan_memmove(void *dest, const void *src, size_t len);
-void *__asan_memcpy(void *dest, const void *src, size_t len);
-
-void __hwasan_load1_noabort(unsigned long addr);
-void __hwasan_store1_noabort(unsigned long addr);
-void __hwasan_load2_noabort(unsigned long addr);
-void __hwasan_store2_noabort(unsigned long addr);
-void __hwasan_load4_noabort(unsigned long addr);
-void __hwasan_store4_noabort(unsigned long addr);
-void __hwasan_load8_noabort(unsigned long addr);
-void __hwasan_store8_noabort(unsigned long addr);
-void __hwasan_load16_noabort(unsigned long addr);
-void __hwasan_store16_noabort(unsigned long addr);
-void __hwasan_loadN_noabort(unsigned long addr, size_t size);
-void __hwasan_storeN_noabort(unsigned long addr, size_t size);
-
-void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size);
-
-void *__hwasan_memset(void *addr, int c, size_t len);
-void *__hwasan_memmove(void *dest, const void *src, size_t len);
-void *__hwasan_memcpy(void *dest, const void *src, size_t len);
+void __asan_alloca_poison(void *, ssize_t size);
+void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom);
+
+void __asan_load1(void *);
+void __asan_store1(void *);
+void __asan_load2(void *);
+void __asan_store2(void *);
+void __asan_load4(void *);
+void __asan_store4(void *);
+void __asan_load8(void *);
+void __asan_store8(void *);
+void __asan_load16(void *);
+void __asan_store16(void *);
+void __asan_loadN(void *, ssize_t size);
+void __asan_storeN(void *, ssize_t size);
+
+void __asan_load1_noabort(void *);
+void __asan_store1_noabort(void *);
+void __asan_load2_noabort(void *);
+void __asan_store2_noabort(void *);
+void __asan_load4_noabort(void *);
+void __asan_store4_noabort(void *);
+void __asan_load8_noabort(void *);
+void __asan_store8_noabort(void *);
+void __asan_load16_noabort(void *);
+void __asan_store16_noabort(void *);
+void __asan_loadN_noabort(void *, ssize_t size);
+void __asan_storeN_noabort(void *, ssize_t size);
+
+void __asan_report_load1_noabort(void *);
+void __asan_report_store1_noabort(void *);
+void __asan_report_load2_noabort(void *);
+void __asan_report_store2_noabort(void *);
+void __asan_report_load4_noabort(void *);
+void __asan_report_store4_noabort(void *);
+void __asan_report_load8_noabort(void *);
+void __asan_report_store8_noabort(void *);
+void __asan_report_load16_noabort(void *);
+void __asan_report_store16_noabort(void *);
+void __asan_report_load_n_noabort(void *, ssize_t size);
+void __asan_report_store_n_noabort(void *, ssize_t size);
+
+void __asan_set_shadow_00(const void *addr, ssize_t size);
+void __asan_set_shadow_f1(const void *addr, ssize_t size);
+void __asan_set_shadow_f2(const void *addr, ssize_t size);
+void __asan_set_shadow_f3(const void *addr, ssize_t size);
+void __asan_set_shadow_f5(const void *addr, ssize_t size);
+void __asan_set_shadow_f8(const void *addr, ssize_t size);
+
+void *__asan_memset(void *addr, int c, ssize_t len);
+void *__asan_memmove(void *dest, const void *src, ssize_t len);
+void *__asan_memcpy(void *dest, const void *src, ssize_t len);
+
+void __hwasan_load1_noabort(void *);
+void __hwasan_store1_noabort(void *);
+void __hwasan_load2_noabort(void *);
+void __hwasan_store2_noabort(void *);
+void __hwasan_load4_noabort(void *);
+void __hwasan_store4_noabort(void *);
+void __hwasan_load8_noabort(void *);
+void __hwasan_store8_noabort(void *);
+void __hwasan_load16_noabort(void *);
+void __hwasan_store16_noabort(void *);
+void __hwasan_loadN_noabort(void *, ssize_t size);
+void __hwasan_storeN_noabort(void *, ssize_t size);
+
+void __hwasan_tag_memory(void *, u8 tag, ssize_t size);
+
+void *__hwasan_memset(void *addr, int c, ssize_t len);
+void *__hwasan_memmove(void *dest, const void *src, ssize_t len);
+void *__hwasan_memcpy(void *dest, const void *src, ssize_t len);
+
+void kasan_tag_mismatch(void *addr, unsigned long access_info,
+ unsigned long ret_ip);
#endif /* __MM_KASAN_KASAN_H */
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 892a9dc9d4d3..ca4b6ff080a6 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -43,6 +43,7 @@ enum kasan_arg_fault {
KASAN_ARG_FAULT_DEFAULT,
KASAN_ARG_FAULT_REPORT,
KASAN_ARG_FAULT_PANIC,
+ KASAN_ARG_FAULT_PANIC_ON_WRITE,
};
static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT;
@@ -57,6 +58,8 @@ static int __init early_kasan_fault(char *arg)
kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
else if (!strcmp(arg, "panic"))
kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
+ else if (!strcmp(arg, "panic_on_write"))
+ kasan_arg_fault = KASAN_ARG_FAULT_PANIC_ON_WRITE;
else
return -EINVAL;
@@ -211,7 +214,7 @@ static void start_report(unsigned long *flags, bool sync)
pr_err("==================================================================\n");
}
-static void end_report(unsigned long *flags, void *addr)
+static void end_report(unsigned long *flags, const void *addr, bool is_write)
{
if (addr)
trace_error_report_end(ERROR_DETECTOR_KASAN,
@@ -220,8 +223,18 @@ static void end_report(unsigned long *flags, void *addr)
spin_unlock_irqrestore(&report_lock, *flags);
if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
check_panic_on_warn("KASAN");
- if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)
+ switch (kasan_arg_fault) {
+ case KASAN_ARG_FAULT_DEFAULT:
+ case KASAN_ARG_FAULT_REPORT:
+ break;
+ case KASAN_ARG_FAULT_PANIC:
panic("kasan.fault=panic set ...\n");
+ break;
+ case KASAN_ARG_FAULT_PANIC_ON_WRITE:
+ if (is_write)
+ panic("kasan.fault=panic_on_write set ...\n");
+ break;
+ }
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
lockdep_on();
report_suppress_stop();
@@ -450,8 +463,8 @@ static void print_memory_metadata(const void *addr)
static void print_report(struct kasan_report_info *info)
{
- void *addr = kasan_reset_tag(info->access_addr);
- u8 tag = get_tag(info->access_addr);
+ void *addr = kasan_reset_tag((void *)info->access_addr);
+ u8 tag = get_tag((void *)info->access_addr);
print_error_description(info);
if (addr_has_metadata(addr))
@@ -468,12 +481,12 @@ static void print_report(struct kasan_report_info *info)
static void complete_report_info(struct kasan_report_info *info)
{
- void *addr = kasan_reset_tag(info->access_addr);
+ void *addr = kasan_reset_tag((void *)info->access_addr);
struct slab *slab;
if (info->type == KASAN_REPORT_ACCESS)
info->first_bad_addr = kasan_find_first_bad_addr(
- info->access_addr, info->access_size);
+ (void *)info->access_addr, info->access_size);
else
info->first_bad_addr = addr;
@@ -536,7 +549,11 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty
print_report(&info);
- end_report(&flags, ptr);
+ /*
+ * Invalid free is considered a "write" since the allocator's metadata
+ * updates involves writes.
+ */
+ end_report(&flags, ptr, true);
}
/*
@@ -544,11 +561,10 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty
* user_access_save/restore(): kasan_report_invalid_free() cannot be called
* from a UACCESS region, and kasan_report_async() is not used on x86.
*/
-bool kasan_report(unsigned long addr, size_t size, bool is_write,
+bool kasan_report(const void *addr, size_t size, bool is_write,
unsigned long ip)
{
bool ret = true;
- void *ptr = (void *)addr;
unsigned long ua_flags = user_access_save();
unsigned long irq_flags;
struct kasan_report_info info;
@@ -562,7 +578,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write,
memset(&info, 0, sizeof(info));
info.type = KASAN_REPORT_ACCESS;
- info.access_addr = ptr;
+ info.access_addr = addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
@@ -571,7 +587,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write,
print_report(&info);
- end_report(&irq_flags, ptr);
+ end_report(&irq_flags, (void *)addr, is_write);
out:
user_access_restore(ua_flags);
@@ -597,7 +613,11 @@ void kasan_report_async(void)
pr_err("Asynchronous fault: no details available\n");
pr_err("\n");
dump_stack_lvl(KERN_ERR);
- end_report(&flags, NULL);
+ /*
+ * Conservatively set is_write=true, because no details are available.
+ * In this mode, kasan.fault=panic_on_write is like kasan.fault=panic.
+ */
+ end_report(&flags, NULL, true);
}
#endif /* CONFIG_KASAN_HW_TAGS */
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 87d39bc0a673..51a1e8a8877f 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -30,9 +30,9 @@
#include "kasan.h"
#include "../slab.h"
-void *kasan_find_first_bad_addr(void *addr, size_t size)
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
{
- void *p = addr;
+ const void *p = addr;
if (!addr_has_metadata(p))
return p;
@@ -362,14 +362,14 @@ void kasan_print_address_stack_frame(const void *addr)
#endif /* CONFIG_KASAN_STACK */
#define DEFINE_ASAN_REPORT_LOAD(size) \
-void __asan_report_load##size##_noabort(unsigned long addr) \
+void __asan_report_load##size##_noabort(void *addr) \
{ \
kasan_report(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_report_load##size##_noabort)
#define DEFINE_ASAN_REPORT_STORE(size) \
-void __asan_report_store##size##_noabort(unsigned long addr) \
+void __asan_report_store##size##_noabort(void *addr) \
{ \
kasan_report(addr, size, true, _RET_IP_); \
} \
@@ -386,13 +386,13 @@ DEFINE_ASAN_REPORT_STORE(4);
DEFINE_ASAN_REPORT_STORE(8);
DEFINE_ASAN_REPORT_STORE(16);
-void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+void __asan_report_load_n_noabort(void *addr, ssize_t size)
{
kasan_report(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_report_load_n_noabort);
-void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+void __asan_report_store_n_noabort(void *addr, ssize_t size)
{
kasan_report(addr, size, true, _RET_IP_);
}
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
index 32e80f78de7d..065e1b2fc484 100644
--- a/mm/kasan/report_hw_tags.c
+++ b/mm/kasan/report_hw_tags.c
@@ -15,7 +15,7 @@
#include "kasan.h"
-void *kasan_find_first_bad_addr(void *addr, size_t size)
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
{
/*
* Hardware Tag-Based KASAN only calls this function for normal memory
diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c
index 8b1f5a73ee6d..689e94f9fe3c 100644
--- a/mm/kasan/report_sw_tags.c
+++ b/mm/kasan/report_sw_tags.c
@@ -30,7 +30,7 @@
#include "kasan.h"
#include "../slab.h"
-void *kasan_find_first_bad_addr(void *addr, size_t size)
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
{
u8 tag = get_tag(addr);
void *p = kasan_reset_tag(addr);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index c8b86f3273b5..dd772f9d0f08 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -28,13 +28,13 @@
bool __kasan_check_read(const volatile void *p, unsigned int size)
{
- return kasan_check_range((unsigned long)p, size, false, _RET_IP_);
+ return kasan_check_range((void *)p, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__kasan_check_read);
bool __kasan_check_write(const volatile void *p, unsigned int size)
{
- return kasan_check_range((unsigned long)p, size, true, _RET_IP_);
+ return kasan_check_range((void *)p, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__kasan_check_write);
@@ -50,7 +50,7 @@ EXPORT_SYMBOL(__kasan_check_write);
#undef memset
void *memset(void *addr, int c, size_t len)
{
- if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
+ if (!kasan_check_range(addr, len, true, _RET_IP_))
return NULL;
return __memset(addr, c, len);
@@ -60,8 +60,8 @@ void *memset(void *addr, int c, size_t len)
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memmove(dest, src, len);
@@ -71,17 +71,17 @@ void *memmove(void *dest, const void *src, size_t len)
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memcpy(dest, src, len);
}
#endif
-void *__asan_memset(void *addr, int c, size_t len)
+void *__asan_memset(void *addr, int c, ssize_t len)
{
- if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
+ if (!kasan_check_range(addr, len, true, _RET_IP_))
return NULL;
return __memset(addr, c, len);
@@ -89,10 +89,10 @@ void *__asan_memset(void *addr, int c, size_t len)
EXPORT_SYMBOL(__asan_memset);
#ifdef __HAVE_ARCH_MEMMOVE
-void *__asan_memmove(void *dest, const void *src, size_t len)
+void *__asan_memmove(void *dest, const void *src, ssize_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memmove(dest, src, len);
@@ -100,10 +100,10 @@ void *__asan_memmove(void *dest, const void *src, size_t len)
EXPORT_SYMBOL(__asan_memmove);
#endif
-void *__asan_memcpy(void *dest, const void *src, size_t len)
+void *__asan_memcpy(void *dest, const void *src, ssize_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memcpy(dest, src, len);
@@ -111,13 +111,13 @@ void *__asan_memcpy(void *dest, const void *src, size_t len)
EXPORT_SYMBOL(__asan_memcpy);
#ifdef CONFIG_KASAN_SW_TAGS
-void *__hwasan_memset(void *addr, int c, size_t len) __alias(__asan_memset);
+void *__hwasan_memset(void *addr, int c, ssize_t len) __alias(__asan_memset);
EXPORT_SYMBOL(__hwasan_memset);
#ifdef __HAVE_ARCH_MEMMOVE
-void *__hwasan_memmove(void *dest, const void *src, size_t len) __alias(__asan_memmove);
+void *__hwasan_memmove(void *dest, const void *src, ssize_t len) __alias(__asan_memmove);
EXPORT_SYMBOL(__hwasan_memmove);
#endif
-void *__hwasan_memcpy(void *dest, const void *src, size_t len) __alias(__asan_memcpy);
+void *__hwasan_memcpy(void *dest, const void *src, ssize_t len) __alias(__asan_memcpy);
EXPORT_SYMBOL(__hwasan_memcpy);
#endif
@@ -226,7 +226,7 @@ static bool shadow_mapped(unsigned long addr)
if (pmd_bad(*pmd))
return true;
pte = pte_offset_kernel(pmd, addr);
- return !pte_none(*pte);
+ return !pte_none(ptep_get(pte));
}
static int __meminit kasan_mem_notifier(struct notifier_block *nb,
@@ -317,7 +317,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
unsigned long page;
pte_t pte;
- if (likely(!pte_none(*ptep)))
+ if (likely(!pte_none(ptep_get(ptep))))
return 0;
page = __get_free_page(GFP_KERNEL);
@@ -328,7 +328,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
- if (likely(pte_none(*ptep))) {
+ if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
page = 0;
}
@@ -418,11 +418,11 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
{
unsigned long page;
- page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT);
+ page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
spin_lock(&init_mm.page_table_lock);
- if (likely(!pte_none(*ptep))) {
+ if (likely(!pte_none(ptep_get(ptep)))) {
pte_clear(&init_mm, addr, ptep);
free_page(page);
}
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 30da65fa02a1..220b5d4c6876 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -70,8 +70,8 @@ u8 kasan_random_tag(void)
return (u8)(state % (KASAN_TAG_MAX + 1));
}
-bool kasan_check_range(unsigned long addr, size_t size, bool write,
- unsigned long ret_ip)
+bool kasan_check_range(const void *addr, size_t size, bool write,
+ unsigned long ret_ip)
{
u8 tag;
u8 *shadow_first, *shadow_last, *shadow;
@@ -133,12 +133,12 @@ bool kasan_byte_accessible(const void *addr)
}
#define DEFINE_HWASAN_LOAD_STORE(size) \
- void __hwasan_load##size##_noabort(unsigned long addr) \
+ void __hwasan_load##size##_noabort(void *addr) \
{ \
- kasan_check_range(addr, size, false, _RET_IP_); \
+ kasan_check_range(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
- void __hwasan_store##size##_noabort(unsigned long addr) \
+ void __hwasan_store##size##_noabort(void *addr) \
{ \
kasan_check_range(addr, size, true, _RET_IP_); \
} \
@@ -150,25 +150,25 @@ DEFINE_HWASAN_LOAD_STORE(4);
DEFINE_HWASAN_LOAD_STORE(8);
DEFINE_HWASAN_LOAD_STORE(16);
-void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
+void __hwasan_loadN_noabort(void *addr, ssize_t size)
{
kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__hwasan_loadN_noabort);
-void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
+void __hwasan_storeN_noabort(void *addr, ssize_t size)
{
kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__hwasan_storeN_noabort);
-void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
+void __hwasan_tag_memory(void *addr, u8 tag, ssize_t size)
{
- kasan_poison((void *)addr, size, tag, false);
+ kasan_poison(addr, size, tag, false);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
-void kasan_tag_mismatch(unsigned long addr, unsigned long access_info,
+void kasan_tag_mismatch(void *addr, unsigned long access_info,
unsigned long ret_ip)
{
kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10,
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 67a222586846..7dcfe341d48e 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -140,5 +140,5 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
void kasan_save_free_info(struct kmem_cache *cache, void *object)
{
- save_stack_info(cache, object, GFP_NOWAIT, true);
+ save_stack_info(cache, object, 0, true);
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 2d0d58fb4e7f..3beb4ad2ee5e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -88,7 +88,7 @@ static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
#define MM_SLOTS_HASH_BITS 10
-static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __read_mostly;
@@ -422,19 +422,17 @@ void __khugepaged_enter(struct mm_struct *mm)
struct mm_slot *slot;
int wakeup;
+ /* __khugepaged_exit() must not run from under us */
+ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
+ return;
+
mm_slot = mm_slot_alloc(mm_slot_cache);
if (!mm_slot)
return;
slot = &mm_slot->slot;
- /* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
- if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
- mm_slot_free(mm_slot_cache, mm_slot);
- return;
- }
-
spin_lock(&khugepaged_mm_lock);
mm_slot_insert(mm_slots_hash, mm, slot);
/*
@@ -513,7 +511,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
struct folio *folio, *tmp;
while (--_pte >= pte) {
- pte_t pteval = *_pte;
+ pte_t pteval = ptep_get(_pte);
unsigned long pfn;
if (pte_none(pteval))
@@ -557,7 +555,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
- pte_t pteval = *_pte;
+ pte_t pteval = ptep_get(_pte);
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
++none_or_zero;
@@ -701,7 +699,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
- pteval = *_pte;
+ pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
if (is_zero_pfn(pte_pfn(pteval))) {
@@ -799,7 +797,7 @@ static int __collapse_huge_page_copy(pte_t *pte,
*/
for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
_pte++, page++, _address += PAGE_SIZE) {
- pteval = *_pte;
+ pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, _address);
continue;
@@ -946,10 +944,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
-/*
- * See pmd_trans_unstable() for how the result may change out from
- * underneath us, even if we hold mmap_lock in read.
- */
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
unsigned long address,
pmd_t **pmd)
@@ -961,11 +955,6 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
return SCAN_PMD_NULL;
pmde = pmdp_get_lockless(*pmd);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- /* See comments in pmd_none_or_trans_huge_or_clear_bad() */
- barrier();
-#endif
if (pmd_none(pmde))
return SCAN_PMD_NONE;
if (!pmd_present(pmde))
@@ -998,9 +987,8 @@ static int check_pmd_still_valid(struct mm_struct *mm,
* Only done if hpage_collapse_scan_pmd believes it is worthwhile.
*
* Called and returns without pte mapped or spinlocks held.
- * Note that if false is returned, mmap_lock will be released.
+ * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
*/
-
static int __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
@@ -1009,23 +997,37 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
int swapped_in = 0;
vm_fault_t ret = 0;
unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
+ int result;
+ pte_t *pte = NULL;
+ spinlock_t *ptl;
for (address = haddr; address < end; address += PAGE_SIZE) {
struct vm_fault vmf = {
.vma = vma,
.address = address,
- .pgoff = linear_page_index(vma, haddr),
+ .pgoff = linear_page_index(vma, address),
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
};
- vmf.pte = pte_offset_map(pmd, address);
- vmf.orig_pte = *vmf.pte;
- if (!is_swap_pte(vmf.orig_pte)) {
- pte_unmap(vmf.pte);
- continue;
+ if (!pte++) {
+ pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
+ if (!pte) {
+ mmap_read_unlock(mm);
+ result = SCAN_PMD_NULL;
+ goto out;
+ }
}
+
+ vmf.orig_pte = ptep_get_lockless(pte);
+ if (!is_swap_pte(vmf.orig_pte))
+ continue;
+
+ vmf.pte = pte;
+ vmf.ptl = ptl;
ret = do_swap_page(&vmf);
+ /* Which unmaps pte (after perhaps re-checking the entry) */
+ pte = NULL;
/*
* do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
@@ -1034,24 +1036,29 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
* resulting in later failure.
*/
if (ret & VM_FAULT_RETRY) {
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
/* Likely, but not guaranteed, that page lock failed */
- return SCAN_PAGE_LOCK;
+ result = SCAN_PAGE_LOCK;
+ goto out;
}
if (ret & VM_FAULT_ERROR) {
mmap_read_unlock(mm);
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return SCAN_FAIL;
+ result = SCAN_FAIL;
+ goto out;
}
swapped_in++;
}
- /* Drain LRU add pagevec to remove extra pin on the swapped in pages */
+ if (pte)
+ pte_unmap(pte);
+
+ /* Drain LRU cache to remove extra pin on the swapped in pages */
if (swapped_in)
lru_add_drain();
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
- return SCAN_SUCCEED;
+ result = SCAN_SUCCEED;
+out:
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
+ return result;
}
static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
@@ -1151,9 +1158,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
- pte = pte_offset_map(pmd, address);
- pte_ptl = pte_lockptr(mm, pmd);
-
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* This removes any huge TLB entry from the CPU so we won't allow
@@ -1168,13 +1172,18 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
mmu_notifier_invalidate_range_end(&range);
tlb_remove_table_sync_one();
- spin_lock(pte_ptl);
- result = __collapse_huge_page_isolate(vma, address, pte, cc,
- &compound_pagelist);
- spin_unlock(pte_ptl);
+ pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+ if (pte) {
+ result = __collapse_huge_page_isolate(vma, address, pte, cc,
+ &compound_pagelist);
+ spin_unlock(pte_ptl);
+ } else {
+ result = SCAN_PMD_NULL;
+ }
if (unlikely(result != SCAN_SUCCEED)) {
- pte_unmap(pte);
+ if (pte)
+ pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
/*
@@ -1258,9 +1267,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte) {
+ result = SCAN_PMD_NULL;
+ goto out;
+ }
+
for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
- pte_t pteval = *_pte;
+ pte_t pteval = ptep_get(_pte);
if (is_swap_pte(pteval)) {
++unmapped;
if (!cc->is_khugepaged ||
@@ -1627,25 +1641,28 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* lockless_pages_from_mm() and the hardware page walker can access page
* tables while all the high-level locks are held in write mode.
*/
- start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
result = SCAN_FAIL;
+ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+ if (!start_pte)
+ goto drop_immap;
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
struct page *page;
+ pte_t ptent = ptep_get(pte);
/* empty pte, skip */
- if (pte_none(*pte))
+ if (pte_none(ptent))
continue;
/* page swapped out, abort */
- if (!pte_present(*pte)) {
+ if (!pte_present(ptent)) {
result = SCAN_PTE_NON_PRESENT;
goto abort;
}
- page = vm_normal_page(vma, addr, *pte);
+ page = vm_normal_page(vma, addr, ptent);
if (WARN_ON_ONCE(page && is_zone_device_page(page)))
page = NULL;
/*
@@ -1661,10 +1678,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
struct page *page;
+ pte_t ptent = ptep_get(pte);
- if (pte_none(*pte))
+ if (pte_none(ptent))
continue;
- page = vm_normal_page(vma, addr, *pte);
+ page = vm_normal_page(vma, addr, ptent);
if (WARN_ON_ONCE(page && is_zone_device_page(page)))
goto abort;
page_remove_rmap(page, vma, false);
@@ -1702,6 +1720,7 @@ drop_hpage:
abort:
pte_unmap_unlock(start_pte, ptl);
+drop_immap:
i_mmap_unlock_write(vma->vm_file->f_mapping);
goto drop_hpage;
}
@@ -1953,7 +1972,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
result = SCAN_FAIL;
goto xa_unlocked;
}
- /* drain pagevecs to help isolate_lru_page() */
+ /* drain lru cache to help isolate_lru_page() */
lru_add_drain();
page = folio_file_page(folio, index);
} else if (trylock_page(page)) {
@@ -1969,7 +1988,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
end - index);
- /* drain pagevecs to help isolate_lru_page() */
+ /* drain lru cache to help isolate_lru_page() */
lru_add_drain();
page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index 7d1e4aa30bae..3adb4c1d3b19 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -74,7 +74,7 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);
/* Don't sleep. */
- flags &= ~__GFP_DIRECT_RECLAIM;
+ flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM);
handle = __stack_depot_save(entries, nr_entries, flags, true);
return stack_depot_set_extra_bits(handle, extra);
@@ -245,7 +245,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
extra_bits = kmsan_extra_bits(depth, uaf);
entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN;
- entries[1] = kmsan_save_stack_with_flags(GFP_ATOMIC, 0);
+ entries[1] = kmsan_save_stack_with_flags(__GFP_HIGH, 0);
entries[2] = id;
/*
* @entries is a local var in non-instrumented code, so KMSAN does not
@@ -253,7 +253,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
* positives when __stack_depot_save() passes it to instrumented code.
*/
kmsan_internal_unpoison_memory(entries, sizeof(entries), false);
- handle = __stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC,
+ handle = __stack_depot_save(entries, ARRAY_SIZE(entries), __GFP_HIGH,
true);
return stack_depot_set_extra_bits(handle, extra_bits);
}
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index cf12e9616b24..cc3907a9c33a 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -282,7 +282,7 @@ void __msan_poison_alloca(void *address, uintptr_t size, char *descr)
/* stack_depot_save() may allocate memory. */
kmsan_enter_runtime();
- handle = stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC);
+ handle = stack_depot_save(entries, ARRAY_SIZE(entries), __GFP_HIGH);
kmsan_leave_runtime();
kmsan_internal_set_shadow_origin(address, size, -1, handle,
diff --git a/mm/ksm.c b/mm/ksm.c
index 0156bded3a66..ba266359da55 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -429,16 +429,17 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
struct page *page = NULL;
spinlock_t *ptl;
pte_t *pte;
+ pte_t ptent;
int ret;
- if (pmd_leaf(*pmd) || !pmd_present(*pmd))
- return 0;
-
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (pte_present(*pte)) {
- page = vm_normal_page(walk->vma, addr, *pte);
- } else if (!pte_none(*pte)) {
- swp_entry_t entry = pte_to_swp_entry(*pte);
+ if (!pte)
+ return 0;
+ ptent = ptep_get(pte);
+ if (pte_present(ptent)) {
+ page = vm_normal_page(walk->vma, addr, ptent);
+ } else if (!pte_none(ptent)) {
+ swp_entry_t entry = pte_to_swp_entry(ptent);
/*
* As KSM pages remain KSM pages until freed, no need to wait
@@ -931,7 +932,7 @@ static int remove_stable_node(struct ksm_stable_node *stable_node)
* The stable node did not yet appear stale to get_ksm_page(),
* since that allows for an unmapped ksm page to be recognized
* right up until it is freed; but the node is safe to remove.
- * This page might be in a pagevec waiting to be freed,
+ * This page might be in an LRU cache waiting to be freed,
* or it might be PageSwapCache (perhaps under writeback),
* or it might have been removed from swapcache a moment ago.
*/
@@ -1086,6 +1087,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
int err = -EFAULT;
struct mmu_notifier_range range;
bool anon_exclusive;
+ pte_t entry;
pvmw.address = page_address_in_vma(page, vma);
if (pvmw.address == -EFAULT)
@@ -1103,10 +1105,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
goto out_unlock;
anon_exclusive = PageAnonExclusive(page);
- if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
+ entry = ptep_get(pvmw.pte);
+ if (pte_write(entry) || pte_dirty(entry) ||
anon_exclusive || mm_tlb_flush_pending(mm)) {
- pte_t entry;
-
swapped = PageSwapCache(page);
flush_cache_page(vma, pvmw.address, page_to_pfn(page));
/*
@@ -1148,7 +1149,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
}
- *orig_pte = *pvmw.pte;
+ *orig_pte = entry;
err = 0;
out_unlock:
@@ -1194,8 +1195,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
* without holding anon_vma lock for write. So when looking for a
* genuine pmde (in which to find pte), test present and !THP together.
*/
- pmde = *pmd;
- barrier();
+ pmde = pmdp_get_lockless(pmd);
if (!pmd_present(pmde) || pmd_trans_huge(pmde))
goto out;
@@ -1204,7 +1204,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
mmu_notifier_invalidate_range_start(&range);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!pte_same(*ptep, orig_pte)) {
+ if (!ptep)
+ goto out_mn;
+ if (!pte_same(ptep_get(ptep), orig_pte)) {
pte_unmap_unlock(ptep, ptl);
goto out_mn;
}
@@ -1231,7 +1233,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
dec_mm_counter(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, addr, pte_pfn(*ptep));
+ flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
/*
* No need to notify as we are replacing a read only page with another
* read only page with the same content.
@@ -2301,8 +2303,8 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
/*
- * A number of pages can hang around indefinitely on per-cpu
- * pagevecs, raised page count preventing write_protect_page
+ * A number of pages can hang around indefinitely in per-cpu
+ * LRU cache, raised page count preventing write_protect_page
* from merging them. Though it doesn't really matter much,
* it is puzzling to see some stuck in pages_volatile until
* other activity jostles them out, and they also prevented
diff --git a/mm/madvise.c b/mm/madvise.c
index b5ffbaf616f5..886f06066622 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -188,37 +188,43 @@ success:
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
- unsigned long end, struct mm_walk *walk)
+ unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->private;
- unsigned long index;
struct swap_iocb *splug = NULL;
+ pte_t *ptep = NULL;
+ spinlock_t *ptl;
+ unsigned long addr;
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- return 0;
-
- for (index = start; index != end; index += PAGE_SIZE) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
pte_t pte;
swp_entry_t entry;
struct page *page;
- spinlock_t *ptl;
- pte_t *ptep;
- ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
- pte = *ptep;
- pte_unmap_unlock(ptep, ptl);
+ if (!ptep++) {
+ ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!ptep)
+ break;
+ }
+ pte = ptep_get(ptep);
if (!is_swap_pte(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
continue;
+ pte_unmap_unlock(ptep, ptl);
+ ptep = NULL;
+
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, index, false, &splug);
+ vma, addr, false, &splug);
if (page)
put_page(page);
}
+
+ if (ptep)
+ pte_unmap_unlock(ptep, ptl);
swap_read_unplug(splug);
cond_resched();
@@ -229,30 +235,34 @@ static const struct mm_walk_ops swapin_walk_ops = {
.pmd_entry = swapin_walk_pmd_entry,
};
-static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+static void shmem_swapin_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
- pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
+ pgoff_t end_index = linear_page_index(vma, end) - 1;
struct page *page;
struct swap_iocb *splug = NULL;
rcu_read_lock();
xas_for_each(&xas, page, end_index) {
- swp_entry_t swap;
+ unsigned long addr;
+ swp_entry_t entry;
if (!xa_is_value(page))
continue;
- swap = radix_to_swp_entry(page);
+ entry = radix_to_swp_entry(page);
/* There might be swapin error entries in shmem mapping. */
- if (non_swap_entry(swap))
+ if (non_swap_entry(entry))
continue;
+
+ addr = vma->vm_start +
+ ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
xas_pause(&xas);
rcu_read_unlock();
- page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
- NULL, 0, false, &splug);
+ page = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
+ vma, addr, false, &splug);
if (page)
put_page(page);
@@ -260,8 +270,6 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
}
rcu_read_unlock();
swap_read_unplug(splug);
-
- lru_add_drain(); /* Push any new pages onto the LRU now */
}
#endif /* CONFIG_SWAP */
@@ -285,8 +293,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
}
if (shmem_mapping(file->f_mapping)) {
- force_shm_swapin_readahead(vma, start, end,
- file->f_mapping);
+ shmem_swapin_range(vma, start, end, file->f_mapping);
+ lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
#else
@@ -340,7 +348,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
bool pageout = private->pageout;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
- pte_t *orig_pte, *pte, ptent;
+ pte_t *start_pte, *pte, ptent;
spinlock_t *ptl;
struct folio *folio = NULL;
LIST_HEAD(folio_list);
@@ -422,15 +430,15 @@ huge_unlock:
}
regular_folio:
- if (pmd_trans_unstable(pmd))
- return 0;
#endif
tlb_change_page_size(tlb, PAGE_SIZE);
- orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!start_pte)
+ return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr < end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
+ ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -447,25 +455,28 @@ regular_folio:
* are sure it's worth. Split it if we are only owner.
*/
if (folio_test_large(folio)) {
+ int err;
+
if (folio_mapcount(folio) != 1)
break;
if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
- folio_get(folio);
- if (!folio_trylock(folio)) {
- folio_put(folio);
- break;
- }
- pte_unmap_unlock(orig_pte, ptl);
- if (split_folio(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!folio_trylock(folio))
break;
- }
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
- orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (err)
+ break;
+ start_pte = pte =
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
pte--;
addr -= PAGE_SIZE;
continue;
@@ -510,8 +521,10 @@ regular_folio:
folio_deactivate(folio);
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(orig_pte, ptl);
+ if (start_pte) {
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ }
if (pageout)
reclaim_pages(&folio_list);
cond_resched();
@@ -612,7 +625,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
- pte_t *orig_pte, *pte, ptent;
+ pte_t *start_pte, *pte, ptent;
struct folio *folio;
int nr_swap = 0;
unsigned long next;
@@ -620,17 +633,16 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
- goto next;
-
- if (pmd_trans_unstable(pmd))
- return 0;
+ return 0;
tlb_change_page_size(tlb, PAGE_SIZE);
- orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
+ ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -664,23 +676,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
* deactivate all pages.
*/
if (folio_test_large(folio)) {
+ int err;
+
if (folio_mapcount(folio) != 1)
- goto out;
+ break;
+ if (!folio_trylock(folio))
+ break;
folio_get(folio);
- if (!folio_trylock(folio)) {
- folio_put(folio);
- goto out;
- }
- pte_unmap_unlock(orig_pte, ptl);
- if (split_folio(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- goto out;
- }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
- orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (err)
+ break;
+ start_pte = pte =
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
pte--;
addr -= PAGE_SIZE;
continue;
@@ -725,17 +740,18 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
folio_mark_lazyfree(folio);
}
-out:
+
if (nr_swap) {
if (current->mm == mm)
sync_mm_rss(mm);
-
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(orig_pte, ptl);
+ if (start_pte) {
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ }
cond_resched();
-next:
+
return 0;
}
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index e1eb33f49059..a26dd8bcfcdb 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -35,7 +35,7 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct wp_walk *wpwalk = walk->private;
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
if (pte_write(ptent)) {
pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
@@ -91,7 +91,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr,
{
struct wp_walk *wpwalk = walk->private;
struct clean_walk *cwalk = to_clean_walk(wpwalk);
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
if (pte_dirty(ptent)) {
pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
@@ -128,19 +128,11 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
{
pmd_t pmdval = pmdp_get_lockless(pmd);
- if (!pmd_trans_unstable(&pmdval))
- return 0;
-
- if (pmd_none(pmdval)) {
- walk->action = ACTION_AGAIN;
- return 0;
- }
-
- /* Huge pmd, present or migrated */
- walk->action = ACTION_CONTINUE;
- if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ /* Do not split a huge pmd, present or migrated */
+ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) {
WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
-
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -156,23 +148,15 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
pud_t pudval = READ_ONCE(*pud);
- if (!pud_trans_unstable(&pudval))
- return 0;
-
- if (pud_none(pudval)) {
- walk->action = ACTION_AGAIN;
- return 0;
- }
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
- /* Huge pud */
- walk->action = ACTION_CONTINUE;
- if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ /* Do not split a huge pud */
+ if (pud_trans_huge(pudval) || pud_devmap(pudval)) {
WARN_ON(pud_write(pudval) || pud_dirty(pudval));
+ walk->action = ACTION_CONTINUE;
+ }
#endif
-
return 0;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 50b921119600..388bc0c78998 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2091,19 +2091,30 @@ static void __init memmap_init_reserved_pages(void)
{
struct memblock_region *region;
phys_addr_t start, end;
- u64 i;
+ int nid;
+
+ /*
+ * set nid on all reserved pages and also treat struct
+ * pages for the NOMAP regions as PageReserved
+ */
+ for_each_mem_region(region) {
+ nid = memblock_get_region_node(region);
+ start = region->base;
+ end = start + region->size;
+
+ if (memblock_is_nomap(region))
+ reserve_bootmem_region(start, end, nid);
+
+ memblock_set_node(start, end, &memblock.reserved, nid);
+ }
/* initialize struct pages for the reserved regions */
- for_each_reserved_mem_range(i, &start, &end)
- reserve_bootmem_region(start, end);
+ for_each_reserved_mem_region(region) {
+ nid = memblock_get_region_node(region);
+ start = region->base;
+ end = start + region->size;
- /* and also treat struct pages for the NOMAP regions as PageReserved */
- for_each_mem_region(region) {
- if (memblock_is_nomap(region)) {
- start = region->base;
- end = start + region->size;
- reserve_bootmem_region(start, end);
- }
+ reserve_bootmem_region(start, end, nid);
}
}
@@ -2131,7 +2142,7 @@ static unsigned long __init free_low_memory_core_early(void)
static int reset_managed_pages_done __initdata;
-void reset_node_managed_pages(pg_data_t *pgdat)
+static void __init reset_node_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4b27e245a055..e8ca4bdcb03c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -485,7 +485,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
if (lru_gen_enabled()) {
if (soft_limit_excess(memcg))
- lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
+ lru_gen_soft_reclaim(memcg, nid);
return;
}
@@ -639,7 +639,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
}
}
-static void do_flush_stats(bool atomic)
+static void do_flush_stats(void)
{
/*
* We always flush the entire tree, so concurrent flushers can just
@@ -652,30 +652,16 @@ static void do_flush_stats(bool atomic)
WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
- if (atomic)
- cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
- else
- cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+ cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
atomic_set(&stats_flush_threshold, 0);
atomic_set(&stats_flush_ongoing, 0);
}
-static bool should_flush_stats(void)
-{
- return atomic_read(&stats_flush_threshold) > num_online_cpus();
-}
-
void mem_cgroup_flush_stats(void)
{
- if (should_flush_stats())
- do_flush_stats(false);
-}
-
-void mem_cgroup_flush_stats_atomic(void)
-{
- if (should_flush_stats())
- do_flush_stats(true);
+ if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+ do_flush_stats();
}
void mem_cgroup_flush_stats_ratelimited(void)
@@ -690,7 +676,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
* Always flush here so that flushing in latency-sensitive paths is
* as cheap as possible.
*/
- do_flush_stats(false);
+ do_flush_stats();
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
@@ -1273,13 +1259,13 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
*
* This function iterates over tasks attached to @memcg or to any of its
* descendants and calls @fn for each task. If @fn returns a non-zero
- * value, the function breaks the iteration loop and returns the value.
- * Otherwise, it will iterate over all tasks and return 0.
+ * value, the function breaks the iteration loop. Otherwise, it will iterate
+ * over all tasks and return 0.
*
* This function must not be called for the root memory cgroup.
*/
-int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
- int (*fn)(struct task_struct *, void *), void *arg)
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+ int (*fn)(struct task_struct *, void *), void *arg)
{
struct mem_cgroup *iter;
int ret = 0;
@@ -1299,7 +1285,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
break;
}
}
- return ret;
}
#ifdef CONFIG_DEBUG_VM
@@ -1580,13 +1565,10 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
-static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
+static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
- struct seq_buf s;
int i;
- seq_buf_init(&s, buf, bufsize);
-
/*
* Provide statistics on the state of the memory subsystem as
* well as cumulative event counters that show past behavior.
@@ -1603,21 +1585,21 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
u64 size;
size = memcg_page_state_output(memcg, memory_stats[i].idx);
- seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
+ seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
size += memcg_page_state_output(memcg,
NR_SLAB_RECLAIMABLE_B);
- seq_buf_printf(&s, "slab %llu\n", size);
+ seq_buf_printf(s, "slab %llu\n", size);
}
}
/* Accumulated memory events */
- seq_buf_printf(&s, "pgscan %lu\n",
+ seq_buf_printf(s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT) +
memcg_events(memcg, PGSCAN_KHUGEPAGED));
- seq_buf_printf(&s, "pgsteal %lu\n",
+ seq_buf_printf(s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT) +
memcg_events(memcg, PGSTEAL_KHUGEPAGED));
@@ -1627,13 +1609,24 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
memcg_vm_event_stat[i] == PGPGOUT)
continue;
- seq_buf_printf(&s, "%s %lu\n",
+ seq_buf_printf(s, "%s %lu\n",
vm_event_name(memcg_vm_event_stat[i]),
memcg_events(memcg, memcg_vm_event_stat[i]));
}
/* The above should easily fit into one page */
- WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+ WARN_ON_ONCE(seq_buf_has_overflowed(s));
+}
+
+static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+
+static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memcg_stat_format(memcg, s);
+ else
+ memcg1_stat_format(memcg, s);
+ WARN_ON_ONCE(seq_buf_has_overflowed(s));
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -1671,6 +1664,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
/* Use static buffer, for the caller is holding oom_lock. */
static char buf[PAGE_SIZE];
+ struct seq_buf s;
lockdep_assert_held(&oom_lock);
@@ -1693,8 +1687,9 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(":");
- memory_stat_format(memcg, buf, sizeof(buf));
- pr_info("%s", buf);
+ seq_buf_init(&s, buf, sizeof(buf));
+ memory_stat_format(memcg, &s);
+ seq_buf_do_printk(&s, KERN_INFO);
}
/*
@@ -2028,26 +2023,12 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (locked)
mem_cgroup_oom_notify(memcg);
- if (locked && !READ_ONCE(memcg->oom_kill_disable)) {
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
- current->memcg_oom_order);
- } else {
- schedule();
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
- }
+ schedule();
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
- if (locked) {
+ if (locked)
mem_cgroup_oom_unlock(memcg);
- /*
- * There is no guarantee that an OOM-lock contender
- * sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitly.
- */
- memcg_oom_recover(memcg);
- }
cleanup:
current->memcg_in_oom = NULL;
css_put(&memcg->css);
@@ -2166,17 +2147,12 @@ again:
* When charge migration first begins, we can have multiple
* critical sections holding the fast-path RCU lock and one
* holding the slowpath move_lock. Track the task who has the
- * move_lock for unlock_page_memcg().
+ * move_lock for folio_memcg_unlock().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
}
-void lock_page_memcg(struct page *page)
-{
- folio_memcg_lock(page_folio(page));
-}
-
static void __folio_memcg_unlock(struct mem_cgroup *memcg)
{
if (memcg && memcg->move_lock_task == current) {
@@ -2204,11 +2180,6 @@ void folio_memcg_unlock(struct folio *folio)
__folio_memcg_unlock(folio_memcg(folio));
}
-void unlock_page_memcg(struct page *page)
-{
- folio_memcg_unlock(page_folio(page));
-}
-
struct memcg_stock_pcp {
local_lock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
@@ -2275,7 +2246,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
+ if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) {
stock->nr_pages -= nr_pages;
ret = true;
}
@@ -2290,7 +2261,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
- struct mem_cgroup *old = stock->cached;
+ struct mem_cgroup *old = READ_ONCE(stock->cached);
if (!old)
return;
@@ -2303,7 +2274,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
}
css_put(&old->css);
- stock->cached = NULL;
+ WRITE_ONCE(stock->cached, NULL);
}
static void drain_local_stock(struct work_struct *dummy)
@@ -2338,10 +2309,10 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
struct memcg_stock_pcp *stock;
stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached != memcg) { /* reset if necessary */
+ if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
drain_stock(stock);
css_get(&memcg->css);
- stock->cached = memcg;
+ WRITE_ONCE(stock->cached, memcg);
}
stock->nr_pages += nr_pages;
@@ -2383,7 +2354,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
bool flush = false;
rcu_read_lock();
- memcg = stock->cached;
+ memcg = READ_ONCE(stock->cached);
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
@@ -2884,7 +2855,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
*
* - the page lock
* - LRU isolation
- * - lock_page_memcg()
+ * - folio_memcg_lock()
* - exclusive reference
* - mem_cgroup_trylock_pages()
*/
@@ -3208,12 +3179,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
* accumulating over a page of vmstat data or when pgdat or idx
* changes.
*/
- if (stock->cached_objcg != objcg) {
+ if (READ_ONCE(stock->cached_objcg) != objcg) {
old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- stock->cached_objcg = objcg;
+ WRITE_ONCE(stock->cached_objcg, objcg);
stock->cached_pgdat = pgdat;
} else if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
@@ -3267,7 +3238,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+ if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
@@ -3279,7 +3250,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
- struct obj_cgroup *old = stock->cached_objcg;
+ struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
return NULL;
@@ -3332,7 +3303,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
stock->cached_pgdat = NULL;
}
- stock->cached_objcg = NULL;
+ WRITE_ONCE(stock->cached_objcg, NULL);
/*
* The `old' objects needs to be released by the caller via
* obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
@@ -3343,10 +3314,11 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
+ struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
- if (stock->cached_objcg) {
- memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (objcg) {
+ memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3365,10 +3337,10 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached_objcg != objcg) { /* reset if necessary */
+ if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
- stock->cached_objcg = objcg;
+ WRITE_ONCE(stock->cached_objcg, objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
allow_uncharge = true; /* Allow uncharge when objcg changes */
@@ -3699,27 +3671,13 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
if (mem_cgroup_is_root(memcg)) {
/*
- * We can reach here from irq context through:
- * uncharge_batch()
- * |--memcg_check_events()
- * |--mem_cgroup_threshold()
- * |--__mem_cgroup_threshold()
- * |--mem_cgroup_usage
- *
- * rstat flushing is an expensive operation that should not be
- * done from irq context; use stale stats in this case.
- * Arguably, usage threshold events are not reliable on the root
- * memcg anyway since its usage is ill-defined.
- *
- * Additionally, other call paths through memcg_check_events()
- * disable irqs, so make sure we are flushing stats atomically.
+ * Approximate root's usage from global state. This isn't
+ * perfect, but the root usage was always an approximation.
*/
- if (in_task())
- mem_cgroup_flush_stats_atomic();
- val = memcg_page_state(memcg, NR_FILE_PAGES) +
- memcg_page_state(memcg, NR_ANON_MAPPED);
+ val = global_node_page_state(NR_FILE_PAGES) +
+ global_node_page_state(NR_ANON_MAPPED);
if (swap)
- val += memcg_page_state(memcg, MEMCG_SWAP);
+ val += total_swap_pages - get_nr_swap_pages();
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -4135,9 +4093,8 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
-static int memcg_stat_show(struct seq_file *m, void *v)
+static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
@@ -4152,18 +4109,18 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+ seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
- memcg_events_local(memcg, memcg1_events[i]));
+ seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
+ memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %lu\n", lru_list_name(i),
- memcg_page_state_local(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -4171,11 +4128,11 @@ static int memcg_stat_show(struct seq_file *m, void *v)
memory = min(memory, READ_ONCE(mi->memory.max));
memsw = min(memsw, READ_ONCE(mi->memsw.max));
}
- seq_printf(m, "hierarchical_memory_limit %llu\n",
- (u64)memory * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
+ (u64)memory * PAGE_SIZE);
if (do_memsw_account())
- seq_printf(m, "hierarchical_memsw_limit %llu\n",
- (u64)memsw * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
+ (u64)memsw * PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
@@ -4183,19 +4140,19 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
- seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+ seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
(u64)nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "total_%s %llu\n",
- vm_event_name(memcg1_events[i]),
- (u64)memcg_events(memcg, memcg1_events[i]));
+ seq_buf_printf(s, "total_%s %llu\n",
+ vm_event_name(memcg1_events[i]),
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "total_%s %llu\n", lru_list_name(i),
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -4210,12 +4167,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
anon_cost += mz->lruvec.anon_cost;
file_cost += mz->lruvec.file_cost;
}
- seq_printf(m, "anon_cost %lu\n", anon_cost);
- seq_printf(m, "file_cost %lu\n", file_cost);
+ seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
+ seq_buf_printf(s, "file_cost %lu\n", file_cost);
}
#endif
-
- return 0;
}
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
@@ -4648,11 +4603,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- /*
- * wb_writeback() takes a spinlock and calls
- * wb_over_bg_thresh()->mem_cgroup_wb_stats(). Do not sleep.
- */
- mem_cgroup_flush_stats_atomic();
+ mem_cgroup_flush_stats();
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -5059,6 +5010,8 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p)
}
#endif
+static int memory_stat_show(struct seq_file *m, void *v);
+
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
@@ -5091,7 +5044,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
},
{
.name = "stat",
- .seq_show = memcg_stat_show,
+ .seq_show = memory_stat_show,
},
{
.name = "force_empty",
@@ -5464,7 +5417,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
- 2UL*HZ);
+ FLUSH_TIME);
lru_gen_online_memcg(memcg);
return 0;
offline_kmem:
@@ -5865,7 +5818,7 @@ static int mem_cgroup_move_account(struct page *page,
* with (un)charging, migration, LRU putback, or anything else
* that would rely on a stable page's memory cgroup.
*
- * Note that lock_page_memcg is a memcg lock, not a page lock,
+ * Note that folio_memcg_lock is a memcg lock, not a page lock,
* to save space. As soon as we switch page's memory cgroup to a
* new memcg that isn't locked, the above state can change
* concurrently again. Make sure we're truly done with it.
@@ -6057,11 +6010,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
return 0;
}
- if (pmd_trans_unstable(pmd))
- return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
for (; addr != end; pte++, addr += PAGE_SIZE)
- if (get_mctgt_type(vma, addr, *pte, NULL))
+ if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
mc.precharge++; /* increment precharge temporarily */
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -6277,12 +6230,12 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
return 0;
}
- if (pmd_trans_unstable(pmd))
- return 0;
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
for (; addr != end; addr += PAGE_SIZE) {
- pte_t ptent = *(pte++);
+ pte_t ptent = ptep_get(pte++);
bool device = false;
swp_entry_t ent;
@@ -6356,7 +6309,7 @@ static void mem_cgroup_move_charge(void)
{
lru_add_drain_all();
/*
- * Signal lock_page_memcg() to take the memcg's move_lock
+ * Signal folio_memcg_lock() to take the memcg's move_lock
* while we're moving its pages to another memcg. Then wait
* for already started RCU-only updates to finish.
*/
@@ -6634,10 +6587,12 @@ static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ struct seq_buf s;
if (!buf)
return -ENOMEM;
- memory_stat_format(memcg, buf, PAGE_SIZE);
+ seq_buf_init(&s, buf, PAGE_SIZE);
+ memory_stat_format(memcg, &s);
seq_puts(m, buf);
kfree(buf);
return 0;
@@ -6896,7 +6851,7 @@ static unsigned long effective_protection(unsigned long usage,
protected = min(usage, setting);
/*
* If all cgroups at this level combined claim and use more
- * protection then what the parent affords them, distribute
+ * protection than what the parent affords them, distribute
* shares in proportion to utilization.
*
* We are using actual utilization rather than the statically
@@ -7421,8 +7376,7 @@ static int __init mem_cgroup_init(void)
for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn;
- rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
- node_online(node) ? node : NUMA_NO_NODE);
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
rtpn->rb_root = RB_ROOT;
rtpn->rb_rightmost = NULL;
@@ -7656,6 +7610,14 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
+static u64 swap_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)memcg->swap.watermark * PAGE_SIZE;
+}
+
static int swap_high_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -7735,6 +7697,11 @@ static struct cftype swap_files[] = {
.write = swap_max_write,
},
{
+ .name = "swap.peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = swap_peak_read,
+ },
+ {
.name = "swap.events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, swap_events_file),
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5b663eca1f29..e245191e6b04 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -6,16 +6,16 @@
* High level machine check handler. Handles pages reported by the
* hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
- *
+ *
* In addition there is a "soft offline" entry point that allows stop using
* not-yet-corrupted-by-suspicious pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronously in respect to
- * other VM users, because memory failures could happen anytime and
- * anywhere. This could violate some of their assumptions. This is why
- * this code has to be extremely careful. Generally it tries to use
- * normal locking rules, as in get the standard locks, even if that means
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
* the error handling takes potentially a long time.
*
* It can be very tempting to add handling for obscure cases here.
@@ -25,12 +25,12 @@
* https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
* - The case actually shows up as a frequent (top 10) page state in
* tools/mm/page-types when running a real workload.
- *
+ *
* There are several operations here with exponential complexity because
- * of unsuitable VM data structures. For example the operation to map back
- * from RMAP chains to processes has to walk the complete process list and
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
* has non linear complexity with the number. But since memory corruptions
- * are rare we hope to get away with this. This avoids impacting the core
+ * are rare we hope to get away with this. This avoids impacting the core
* VM.
*/
@@ -123,7 +123,6 @@ const struct attribute_group memory_failure_attr_group = {
.attrs = memory_failure_attr,
};
-#ifdef CONFIG_SYSCTL
static struct ctl_table memory_failure_table[] = {
{
.procname = "memory_failure_early_kill",
@@ -146,14 +145,6 @@ static struct ctl_table memory_failure_table[] = {
{ }
};
-static int __init memory_failure_sysctl_init(void)
-{
- register_sysctl_init("vm", memory_failure_table);
- return 0;
-}
-late_initcall(memory_failure_sysctl_init);
-#endif /* CONFIG_SYSCTL */
-
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
@@ -395,6 +386,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ pte_t ptent;
VM_BUG_ON_VMA(address == -EFAULT, vma);
pgd = pgd_offset(vma->vm_mm, address);
@@ -414,7 +406,10 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
if (pmd_devmap(*pmd))
return PMD_SHIFT;
pte = pte_offset_map(pmd, address);
- if (pte_present(*pte) && pte_devmap(*pte))
+ if (!pte)
+ return 0;
+ ptent = ptep_get(pte);
+ if (pte_present(ptent) && pte_devmap(ptent))
ret = PAGE_SHIFT;
pte_unmap(pte);
return ret;
@@ -800,13 +795,13 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
goto out;
}
- if (pmd_trans_unstable(pmdp))
- goto out;
-
mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
addr, &ptl);
+ if (!ptep)
+ goto out;
+
for (; addr != end; ptep++, addr += PAGE_SIZE) {
- ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
+ ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT,
hwp->pfn, &hwp->tk);
if (ret == 1)
break;
@@ -2441,6 +2436,8 @@ static int __init memory_failure_init(void)
INIT_WORK(&mf_cpu->work, memory_failure_work_func);
}
+ register_sysctl_init("vm", memory_failure_table);
+
return 0;
}
core_initcall(memory_failure_init);
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index e593e56e530b..a516e303e304 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -366,7 +366,7 @@ static void establish_demotion_targets(void)
lockdep_assert_held_once(&memory_tier_lock);
- if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
+ if (!node_demotion)
return;
disable_all_demotion_targets();
@@ -451,7 +451,6 @@ static void establish_demotion_targets(void)
}
#else
-static inline void disable_all_demotion_targets(void) {}
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */
diff --git a/mm/memory.c b/mm/memory.c
index f69fbc251198..f758f59f3704 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -699,15 +699,17 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
struct page *page, unsigned long address,
pte_t *ptep)
{
+ pte_t orig_pte;
pte_t pte;
swp_entry_t entry;
+ orig_pte = ptep_get(ptep);
pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
- if (pte_swp_soft_dirty(*ptep))
+ if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
- entry = pte_to_swp_entry(*ptep);
- if (pte_swp_uffd_wp(*ptep))
+ entry = pte_to_swp_entry(orig_pte);
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_mkuffd_wp(pte);
else if (is_writable_device_exclusive_entry(entry))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -744,7 +746,7 @@ static int
try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr)
{
- swp_entry_t entry = pte_to_swp_entry(*src_pte);
+ swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
struct page *page = pfn_swap_entry_to_page(entry);
if (trylock_page(page)) {
@@ -768,9 +770,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
unsigned long vm_flags = dst_vma->vm_flags;
- pte_t pte = *src_pte;
+ pte_t orig_pte = ptep_get(src_pte);
+ pte_t pte = orig_pte;
struct page *page;
- swp_entry_t entry = pte_to_swp_entry(pte);
+ swp_entry_t entry = pte_to_swp_entry(orig_pte);
if (likely(!non_swap_entry(entry))) {
if (swap_duplicate(entry) < 0)
@@ -785,8 +788,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
spin_unlock(&mmlist_lock);
}
/* Mark the swap entry as shared. */
- if (pte_swp_exclusive(*src_pte)) {
- pte = pte_swp_clear_exclusive(*src_pte);
+ if (pte_swp_exclusive(orig_pte)) {
+ pte = pte_swp_clear_exclusive(orig_pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
rss[MM_SWAPENTS]++;
@@ -805,9 +808,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
entry = make_readable_migration_entry(
swp_offset(entry));
pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*src_pte))
+ if (pte_swp_soft_dirty(orig_pte))
pte = pte_swp_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(*src_pte))
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
@@ -840,7 +843,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
entry = make_readable_device_private_entry(
swp_offset(entry));
pte = swp_entry_to_pte(entry);
- if (pte_swp_uffd_wp(*src_pte))
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
@@ -904,7 +907,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
/* All done, just insert the new page copy in the child */
pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
- if (userfaultfd_pte_wp(dst_vma, *src_pte))
+ if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
/* Uffd-wp needs to be delivered to dest pte as well */
pte = pte_mkuffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
@@ -922,7 +925,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
{
struct mm_struct *src_mm = src_vma->vm_mm;
unsigned long vm_flags = src_vma->vm_flags;
- pte_t pte = *src_pte;
+ pte_t pte = ptep_get(src_pte);
struct page *page;
struct folio *folio;
@@ -1002,6 +1005,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
+ pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
int progress, ret = 0;
int rss[NR_MM_COUNTERS];
@@ -1012,13 +1016,25 @@ again:
progress = 0;
init_rss_vec(rss);
+ /*
+ * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
+ * error handling here, assume that exclusive mmap_lock on dst and src
+ * protects anon from unexpected THP transitions; with shmem and file
+ * protected by mmap_lock-less collapse skipping areas with anon_vma
+ * (whereas vma_needs_copy() skips areas without anon_vma). A rework
+ * can remove such assumptions later, but this is good enough for now.
+ */
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte) {
ret = -ENOMEM;
goto out;
}
- src_pte = pte_offset_map(src_pmd, addr);
- src_ptl = pte_lockptr(src_mm, src_pmd);
+ src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+ if (!src_pte) {
+ pte_unmap_unlock(dst_pte, dst_ptl);
+ /* ret == 0 */
+ goto out;
+ }
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
@@ -1035,17 +1051,18 @@ again:
spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
break;
}
- if (pte_none(*src_pte)) {
+ ptent = ptep_get(src_pte);
+ if (pte_none(ptent)) {
progress++;
continue;
}
- if (unlikely(!pte_present(*src_pte))) {
+ if (unlikely(!pte_present(ptent))) {
ret = copy_nonpresent_pte(dst_mm, src_mm,
dst_pte, src_pte,
dst_vma, src_vma,
addr, rss);
if (ret == -EIO) {
- entry = pte_to_swp_entry(*src_pte);
+ entry = pte_to_swp_entry(ptep_get(src_pte));
break;
} else if (ret == -EBUSY) {
break;
@@ -1083,8 +1100,7 @@ again:
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
- spin_unlock(src_ptl);
- pte_unmap(orig_src_pte);
+ pte_unmap_unlock(orig_src_pte, src_ptl);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
@@ -1388,14 +1404,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
swp_entry_t entry;
tlb_change_page_size(tlb, PAGE_SIZE);
-again:
init_rss_vec(rss);
- start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- pte = start_pte;
+ start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return addr;
+
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
struct page *page;
if (pte_none(ptent))
@@ -1507,17 +1524,10 @@ again:
* If we forced a TLB flush (either due to running out of
* batch buffers or because we needed to flush dirty TLB
* entries before releasing the ptl), free the batched
- * memory too. Restart if we didn't do everything.
+ * memory too. Come back again if we didn't do everything.
*/
- if (force_flush) {
- force_flush = 0;
+ if (force_flush)
tlb_flush_mmu(tlb);
- }
-
- if (addr != end) {
- cond_resched();
- goto again;
- }
return addr;
}
@@ -1536,8 +1546,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false, NULL);
- else if (zap_huge_pmd(tlb, vma, pmd, addr))
- goto next;
+ else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
+ addr = next;
+ continue;
+ }
/* fall through */
} else if (details && details->single_folio &&
folio_test_pmd_mappable(details->single_folio) &&
@@ -1550,20 +1562,14 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
*/
spin_unlock(ptl);
}
-
- /*
- * Here there can be other concurrent MADV_DONTNEED or
- * trans huge page faults running, and if the pmd is
- * none or trans huge it can change under us. This is
- * because MADV_DONTNEED holds the mmap_lock in read
- * mode.
- */
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- goto next;
- next = zap_pte_range(tlb, vma, pmd, addr, next, details);
-next:
- cond_resched();
- } while (pmd++, addr = next, addr != end);
+ if (pmd_none(*pmd)) {
+ addr = next;
+ continue;
+ }
+ addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
+ if (addr != next)
+ pmd--;
+ } while (pmd++, cond_resched(), addr != end);
return addr;
}
@@ -1821,7 +1827,7 @@ static int validate_page_before_insert(struct page *page)
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
- if (!pte_none(*pte))
+ if (!pte_none(ptep_get(pte)))
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
@@ -1905,6 +1911,10 @@ more:
const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+ if (!start_pte) {
+ ret = -EFAULT;
+ goto out;
+ }
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
int err = insert_page_in_batch_locked(vma, pte,
addr, pages[curr_page_idx], prot);
@@ -2111,7 +2121,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
return VM_FAULT_OOM;
- if (!pte_none(*pte)) {
+ entry = ptep_get(pte);
+ if (!pte_none(entry)) {
if (mkwrite) {
/*
* For read faults on private mappings the PFN passed
@@ -2123,11 +2134,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* allocation and mapping invalidation so just skip the
* update.
*/
- if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
- WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
+ if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
goto out_unlock;
}
- entry = pte_mkyoung(*pte);
+ entry = pte_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, addr, pte, entry, 1))
update_mmu_cache(vma, addr, pte);
@@ -2339,7 +2350,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
return -ENOMEM;
arch_enter_lazy_mmu_mode();
do {
- BUG_ON(!pte_none(*pte));
+ BUG_ON(!pte_none(ptep_get(pte)));
if (!pfn_modify_allowed(pfn, prot)) {
err = -EACCES;
break;
@@ -2572,15 +2583,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
mapped_pte = pte = (mm == &init_mm) ?
pte_offset_kernel(pmd, addr) :
pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -EINVAL;
}
- BUG_ON(pmd_huge(*pmd));
-
arch_enter_lazy_mmu_mode();
if (fn) {
do {
- if (create || !pte_none(*pte)) {
+ if (create || !pte_none(ptep_get(pte))) {
err = fn(pte++, addr, data);
if (err)
break;
@@ -2781,10 +2792,9 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
- spin_lock(ptl);
- same = pte_same(*vmf->pte, vmf->orig_pte);
- spin_unlock(ptl);
+ spin_lock(vmf->ptl);
+ same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
+ spin_unlock(vmf->ptl);
}
#endif
pte_unmap(vmf->pte);
@@ -2804,7 +2814,6 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
int ret;
void *kaddr;
void __user *uaddr;
- bool locked = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = vmf->address;
@@ -2830,17 +2839,18 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
+ vmf->pte = NULL;
if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
- locked = true;
- if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
/*
* Other thread has already handled the fault
* and update local tlb only
*/
- update_mmu_tlb(vma, addr, vmf->pte);
+ if (vmf->pte)
+ update_mmu_tlb(vma, addr, vmf->pte);
ret = -EAGAIN;
goto pte_unlock;
}
@@ -2857,15 +2867,15 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
- if (locked)
+ if (vmf->pte)
goto warn;
/* Re-validate under PTL if the page is still mapped */
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
- locked = true;
- if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
/* The PTE changed under us, update local tlb */
- update_mmu_tlb(vma, addr, vmf->pte);
+ if (vmf->pte)
+ update_mmu_tlb(vma, addr, vmf->pte);
ret = -EAGAIN;
goto pte_unlock;
}
@@ -2888,7 +2898,7 @@ warn:
ret = 0;
pte_unlock:
- if (locked)
+ if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
kunmap_atomic(kaddr);
flush_dcache_page(dst);
@@ -3110,7 +3120,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* Re-check the pte - we dropped the lock
*/
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
if (old_folio) {
if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(&old_folio->page));
@@ -3178,19 +3188,20 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
/* Free the old page.. */
new_folio = old_folio;
page_copied = 1;
- } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ } else if (vmf->pte) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
}
- if (new_folio)
- folio_put(new_folio);
-
- pte_unmap_unlock(vmf->pte, vmf->ptl);
/*
* No need to double call mmu_notifier->invalidate_range() callback as
* the above ptep_clear_flush_notify() did already call it.
*/
mmu_notifier_invalidate_range_only_end(&range);
+
+ if (new_folio)
+ folio_put(new_folio);
if (old_folio) {
if (page_copied)
free_swap_cache(&old_folio->page);
@@ -3230,11 +3241,13 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
+ if (!vmf->pte)
+ return VM_FAULT_NOPAGE;
/*
* We might have raced with another page fault while we released the
* pte_offset_map_lock.
*/
- if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
@@ -3329,7 +3342,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
struct folio *folio = NULL;
if (likely(!unshare)) {
- if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+ if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
}
@@ -3388,8 +3401,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
goto copy;
if (!folio_test_lru(folio))
/*
- * Note: We cannot easily detect+handle references from
- * remote LRU pagevecs or references to LRU folios.
+ * We cannot easily detect+handle references from
+ * remote LRU caches or references to LRU folios.
*/
lru_add_drain();
if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
@@ -3591,10 +3604,11 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
folio_unlock(folio);
folio_put(folio);
@@ -3625,6 +3639,8 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
{
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ return 0;
/*
* Be careful so that we will only recover a special uffd-wp pte into a
* none pte. Otherwise it means the pte could have changed, so retry.
@@ -3633,7 +3649,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
* quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
* So is_pte_marker() check is not enough to safely drop the pte.
*/
- if (pte_same(vmf->orig_pte, *vmf->pte))
+ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
@@ -3728,10 +3744,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->page = pfn_swap_entry_to_page(entry);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
- spin_unlock(vmf->ptl);
- goto out;
- }
+ if (unlikely(!vmf->pte ||
+ !pte_same(ptep_get(vmf->pte),
+ vmf->orig_pte)))
+ goto unlock;
/*
* Get a page reference while we know the page can't be
@@ -3807,7 +3823,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ if (likely(vmf->pte &&
+ pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
ret = VM_FAULT_OOM;
goto unlock;
}
@@ -3863,7 +3880,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* If we want to map a page that's in the swapcache writable, we
* have to detect via the refcount if we're really the exclusive
* owner. Try removing the extra reference from the local LRU
- * pagevecs if required.
+ * caches if required.
*/
if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
!folio_test_ksm(folio) && !folio_test_lru(folio))
@@ -3877,7 +3894,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
goto out_nomap;
if (unlikely(!folio_test_uptodate(folio))) {
@@ -4003,13 +4020,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
if (si)
put_swap_device(si);
return ret;
out_nomap:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
folio_unlock(folio);
out_release:
@@ -4041,22 +4060,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
/*
- * Use pte_alloc() instead of pte_alloc_map(). We can't run
- * pte_offset_map() on pmds where a huge pmd might be created
- * from a different thread.
- *
- * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
- * parallel threads are excluded by other means.
- *
- * Here we only have mmap_read_lock(mm).
+ * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
+ * be distinguished from a transient failure of pte_offset_map().
*/
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
- /* See comment in handle_pte_fault() */
- if (unlikely(pmd_trans_unstable(vmf->pmd)))
- return 0;
-
/* Use the zero-page for reads */
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
@@ -4064,6 +4073,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ goto unlock;
if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
@@ -4104,6 +4115,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
+ if (!vmf->pte)
+ goto release;
if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
@@ -4131,7 +4144,8 @@ setpte:
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
folio_put(folio);
@@ -4325,9 +4339,9 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
static bool vmf_pte_changed(struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
- return !pte_same(*vmf->pte, vmf->orig_pte);
+ return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
- return !pte_none(*vmf->pte);
+ return !pte_none(ptep_get(vmf->pte));
}
/**
@@ -4380,15 +4394,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
}
- /*
- * See comment in handle_pte_fault() for how this scenario happens, we
- * need to return NOPAGE so that we drop this page.
- */
- if (pmd_devmap_trans_unstable(vmf->pmd))
- return VM_FAULT_NOPAGE;
-
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ return VM_FAULT_NOPAGE;
/* Re-check under ptl */
if (likely(!vmf_pte_changed(vmf))) {
@@ -4630,17 +4639,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
*/
if (!vma->vm_ops->fault) {
- /*
- * If we find a migration pmd entry or a none pmd entry, which
- * should never happen, return SIGBUS
- */
- if (unlikely(!pmd_present(*vmf->pmd)))
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
ret = VM_FAULT_SIGBUS;
else {
- vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
- vmf->pmd,
- vmf->address,
- &vmf->ptl);
/*
* Make sure this is not a temporary clearing of pte
* by holding ptl and checking again. A R/M/W update
@@ -4648,7 +4651,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
* we don't have concurrent modification by hardware
* followed by an update.
*/
- if (unlikely(pte_none(*vmf->pte)))
+ if (unlikely(pte_none(ptep_get(vmf->pte))))
ret = VM_FAULT_SIGBUS;
else
ret = VM_FAULT_NOPAGE;
@@ -4703,9 +4706,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*/
- vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -4774,9 +4776,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_MIGRATED;
} else {
flags |= TNF_MIGRATE_FAIL;
- vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
- spin_lock(vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
+ goto out;
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -4905,38 +4909,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
} else {
/*
- * If a huge pmd materialized under us just retry later. Use
- * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
- * of pmd_trans_huge() to ensure the pmd didn't become
- * pmd_trans_huge under us and then back to pmd_none, as a
- * result of MADV_DONTNEED running immediately after a huge pmd
- * fault in a different thread of this mm, in turn leading to a
- * misleading pmd_trans_huge() retval. All we have to ensure is
- * that it is a regular pmd that we can walk with
- * pte_offset_map() and we can do that through an atomic read
- * in C, which is what pmd_trans_unstable() provides.
- */
- if (pmd_devmap_trans_unstable(vmf->pmd))
- return 0;
- /*
* A regular pmd is established and it can't morph into a huge
- * pmd from under us anymore at this point because we hold the
- * mmap_lock read mode and khugepaged takes it in write mode.
- * So now it's safe to run pte_offset_map().
+ * pmd by anon khugepaged, since that takes mmap_lock in write
+ * mode; but shmem or file collapse to THP could still morph
+ * it into a huge pmd: just retry later if so.
*/
- vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
- vmf->orig_pte = *vmf->pte;
+ vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
+ return 0;
+ vmf->orig_pte = ptep_get_lockless(vmf->pte);
vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
- /*
- * some architectures can have larger ptes than wordsize,
- * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
- * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
- * accesses. The code below just needs a consistent view
- * for the ifs and we later double check anyway with the
- * ptl lock held. So here a barrier will do.
- */
- barrier();
if (pte_none(vmf->orig_pte)) {
pte_unmap(vmf->pte);
vmf->pte = NULL;
@@ -4952,10 +4936,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
- vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
- if (unlikely(!pte_same(*vmf->pte, entry))) {
+ if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
}
@@ -5060,9 +5043,8 @@ retry_pud:
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- vmf.orig_pmd = *vmf.pmd;
+ vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
- barrier();
if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(vmf.orig_pmd));
@@ -5439,11 +5421,10 @@ int follow_pte(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- goto out;
-
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
- if (!pte_present(*ptep))
+ if (!ptep)
+ goto out;
+ if (!pte_present(ptep_get(ptep)))
goto unlock;
*ptepp = ptep;
return 0;
@@ -5480,7 +5461,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
- *pfn = pte_pfn(*ptep);
+ *pfn = pte_pfn(ptep_get(ptep));
pte_unmap_unlock(ptep, ptl);
return 0;
}
@@ -5500,7 +5481,7 @@ int follow_phys(struct vm_area_struct *vma,
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
goto out;
- pte = *ptep;
+ pte = ptep_get(ptep);
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
@@ -5544,7 +5525,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
retry:
if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
return -EINVAL;
- pte = *ptep;
+ pte = ptep_get(ptep);
pte_unmap_unlock(ptep, ptl);
prot = pgprot_val(pte_pgprot(pte));
@@ -5560,7 +5541,7 @@ retry:
if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
goto out_unmap;
- if (!pte_same(pte, *ptep)) {
+ if (!pte_same(pte, ptep_get(ptep))) {
pte_unmap_unlock(ptep, ptl);
iounmap(maddr);
@@ -5587,7 +5568,6 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
int len, unsigned int gup_flags)
{
- struct vm_area_struct *vma;
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
@@ -5596,29 +5576,30 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
/* ignore errors, just check how much was successfully transferred */
while (len) {
- int bytes, ret, offset;
+ int bytes, offset;
void *maddr;
- struct page *page = NULL;
+ struct vm_area_struct *vma = NULL;
+ struct page *page = get_user_page_vma_remote(mm, addr,
+ gup_flags, &vma);
- ret = get_user_pages_remote(mm, addr, 1,
- gup_flags, &page, &vma, NULL);
- if (ret <= 0) {
+ if (IS_ERR_OR_NULL(page)) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
#else
+ int res = 0;
+
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
- vma = vma_lookup(mm, addr);
if (!vma)
break;
if (vma->vm_ops && vma->vm_ops->access)
- ret = vma->vm_ops->access(vma, addr, buf,
+ res = vma->vm_ops->access(vma, addr, buf,
len, write);
- if (ret <= 0)
+ if (res <= 0)
break;
- bytes = ret;
+ bytes = res;
#endif
} else {
bytes = len;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8e0fa209d533..3f231cf1b410 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -13,7 +13,6 @@
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
-#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
@@ -325,7 +324,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
}
if (check_pfn_span(pfn, nr_pages)) {
- WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
+ WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1);
return -EINVAL;
}
@@ -492,18 +491,6 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
set_zone_contiguous(zone);
}
-static void __remove_section(unsigned long pfn, unsigned long nr_pages,
- unsigned long map_offset,
- struct vmem_altmap *altmap)
-{
- struct mem_section *ms = __pfn_to_section(pfn);
-
- if (WARN_ON_ONCE(!valid_section(ms)))
- return;
-
- sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
-}
-
/**
* __remove_pages() - remove sections of pages
* @pfn: starting pageframe (must be aligned to start of a section)
@@ -520,12 +507,9 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
{
const unsigned long end_pfn = pfn + nr_pages;
unsigned long cur_nr_pages;
- unsigned long map_offset = 0;
-
- map_offset = vmem_altmap_offset(altmap);
if (check_pfn_span(pfn, nr_pages)) {
- WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
+ WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1);
return;
}
@@ -534,8 +518,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
/* Select all remaining pages up to the next section boundary */
cur_nr_pages = min(end_pfn - pfn,
SECTION_ALIGN_UP(pfn + 1) - pfn);
- __remove_section(pfn, cur_nr_pages, map_offset, altmap);
- map_offset = 0;
+ sparse_remove_section(pfn, cur_nr_pages, altmap);
}
}
@@ -1172,16 +1155,6 @@ failed_addition:
return ret;
}
-static void reset_node_present_pages(pg_data_t *pgdat)
-{
- struct zone *z;
-
- for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- z->present_pages = 0;
-
- pgdat->node_present_pages = 0;
-}
-
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_init_pgdat(int nid)
{
@@ -1204,15 +1177,6 @@ static pg_data_t __ref *hotadd_init_pgdat(int nid)
*/
build_all_zonelists(pgdat);
- /*
- * When memory is hot-added, all the memory is in offline state. So
- * clear all zones' present_pages because they will be updated in
- * online_pages() and offline_pages().
- * TODO: should be in free_area_init_core_hotplug?
- */
- reset_node_managed_pages(pgdat);
- reset_node_present_pages(pgdat);
-
return pgdat;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1756389a0609..edc25195f5bd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -508,20 +508,23 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long flags = qp->flags;
bool has_unmovable = false;
pte_t *pte, *mapped_pte;
+ pte_t ptent;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl)
return queue_folios_pmd(pmd, ptl, addr, end, walk);
- if (pmd_trans_unstable(pmd))
- return 0;
-
mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (; addr != end; pte++, addr += PAGE_SIZE) {
- if (!pte_present(*pte))
+ ptent = ptep_get(pte);
+ if (!pte_present(ptent))
continue;
- folio = vm_normal_folio(vma, addr, *pte);
+ folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
/*
@@ -1195,24 +1198,22 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
-static struct page *new_page(struct page *page, unsigned long start)
+static struct folio *new_folio(struct folio *src, unsigned long start)
{
- struct folio *dst, *src = page_folio(page);
struct vm_area_struct *vma;
unsigned long address;
VMA_ITERATOR(vmi, current->mm, start);
gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
for_each_vma(vmi, vma) {
- address = page_address_in_vma(page, vma);
+ address = page_address_in_vma(&src->page, vma);
if (address != -EFAULT)
break;
}
if (folio_test_hugetlb(src)) {
- dst = alloc_hugetlb_folio_vma(folio_hstate(src),
+ return alloc_hugetlb_folio_vma(folio_hstate(src),
vma, address);
- return &dst->page;
}
if (folio_test_large(src))
@@ -1221,9 +1222,8 @@ static struct page *new_page(struct page *page, unsigned long start)
/*
* if !vma, vma_alloc_folio() will use task or system default policy
*/
- dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
+ return vma_alloc_folio(gfp, folio_order(src), vma, address,
folio_test_large(src));
- return &dst->page;
}
#else
@@ -1239,7 +1239,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct page *new_page(struct page *page, unsigned long start)
+static struct folio *new_folio(struct folio *src, unsigned long start)
{
return NULL;
}
@@ -1334,7 +1334,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
- nr_failed = migrate_pages(&pagelist, new_page, NULL,
+ nr_failed = migrate_pages(&pagelist, new_folio, NULL,
start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
if (nr_failed)
putback_movable_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
index 01cac26a3127..24baad2571e3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -21,7 +21,6 @@
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
-#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
@@ -188,6 +187,7 @@ static bool remove_migration_pte(struct folio *folio,
while (page_vma_mapped_walk(&pvmw)) {
rmap_t rmap_flags = RMAP_NONE;
+ pte_t old_pte;
pte_t pte;
swp_entry_t entry;
struct page *new;
@@ -210,17 +210,18 @@ static bool remove_migration_pte(struct folio *folio,
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
- if (pte_swp_soft_dirty(*pvmw.pte))
+ old_pte = ptep_get(pvmw.pte);
+ if (pte_swp_soft_dirty(old_pte))
pte = pte_mksoft_dirty(pte);
- entry = pte_to_swp_entry(*pvmw.pte);
+ entry = pte_to_swp_entry(old_pte);
if (!is_migration_entry_young(entry))
pte = pte_mkold(pte);
if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
pte = pte_mkdirty(pte);
if (is_writable_migration_entry(entry))
pte = pte_mkwrite(pte);
- else if (pte_swp_uffd_wp(*pvmw.pte))
+ else if (pte_swp_uffd_wp(old_pte))
pte = pte_mkuffd_wp(pte);
if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
@@ -234,9 +235,9 @@ static bool remove_migration_pte(struct folio *folio,
entry = make_readable_device_private_entry(
page_to_pfn(new));
pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*pvmw.pte))
+ if (pte_swp_soft_dirty(old_pte))
pte = pte_swp_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(*pvmw.pte))
+ if (pte_swp_uffd_wp(old_pte))
pte = pte_swp_mkuffd_wp(pte);
}
@@ -296,14 +297,21 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
*/
-void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
- spinlock_t *ptl)
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
{
+ spinlock_t *ptl;
+ pte_t *ptep;
pte_t pte;
swp_entry_t entry;
- spin_lock(ptl);
- pte = *ptep;
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!ptep)
+ return;
+
+ pte = ptep_get(ptep);
+ pte_unmap(ptep);
+
if (!is_swap_pte(pte))
goto out;
@@ -311,18 +319,10 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
if (!is_migration_entry(entry))
goto out;
- migration_entry_wait_on_locked(entry, ptep, ptl);
+ migration_entry_wait_on_locked(entry, ptl);
return;
out:
- pte_unmap_unlock(ptep, ptl);
-}
-
-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
-{
- spinlock_t *ptl = pte_lockptr(mm, pmd);
- pte_t *ptep = pte_offset_map(pmd, address);
- __migration_entry_wait(mm, ptep, ptl);
+ spin_unlock(ptl);
}
#ifdef CONFIG_HUGETLB_PAGE
@@ -332,9 +332,9 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
*
* This function will release the vma lock before returning.
*/
-void __migration_entry_wait_huge(struct vm_area_struct *vma,
- pte_t *ptep, spinlock_t *ptl)
+void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *ptep)
{
+ spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
pte_t pte;
hugetlb_vma_assert_locked(vma);
@@ -352,16 +352,9 @@ void __migration_entry_wait_huge(struct vm_area_struct *vma,
* lock release in migration_entry_wait_on_locked().
*/
hugetlb_vma_unlock_read(vma);
- migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
+ migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
}
}
-
-void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
-{
- spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
-
- __migration_entry_wait_huge(vma, pte, ptl);
-}
#endif
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -372,7 +365,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
ptl = pmd_lock(mm, pmd);
if (!is_pmd_migration_entry(*pmd))
goto unlock;
- migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
+ migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl);
return;
unlock:
spin_unlock(ptl);
@@ -492,6 +485,11 @@ int folio_migrate_mapping(struct address_space *mapping,
if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
+
+ if (folio_test_pmd_mappable(folio)) {
+ __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
+ }
}
#ifdef CONFIG_SWAP
if (folio_test_swapcache(folio)) {
@@ -692,37 +690,32 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
enum migrate_mode mode)
{
struct buffer_head *bh = head;
+ struct buffer_head *failed_bh;
- /* Simple case, sync compaction */
- if (mode != MIGRATE_ASYNC) {
- do {
- lock_buffer(bh);
- bh = bh->b_this_page;
-
- } while (bh != head);
-
- return true;
- }
-
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
do {
if (!trylock_buffer(bh)) {
- /*
- * We failed to lock the buffer and cannot stall in
- * async migration. Release the taken locks
- */
- struct buffer_head *failed_bh = bh;
- bh = head;
- while (bh != failed_bh) {
- unlock_buffer(bh);
- bh = bh->b_this_page;
- }
- return false;
+ if (mode == MIGRATE_ASYNC)
+ goto unlock;
+ if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh))
+ goto unlock;
+ lock_buffer(bh);
}
bh = bh->b_this_page;
} while (bh != head);
+
return true;
+
+unlock:
+ /* We failed to lock the buffer and cannot stall. */
+ failed_bh = bh;
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ bh = bh->b_this_page;
+ }
+
+ return false;
}
static int __buffer_migrate_folio(struct address_space *mapping,
@@ -1072,15 +1065,13 @@ static void migrate_folio_undo_src(struct folio *src,
}
/* Restore the destination folio to the original state upon failure */
-static void migrate_folio_undo_dst(struct folio *dst,
- bool locked,
- free_page_t put_new_page,
- unsigned long private)
+static void migrate_folio_undo_dst(struct folio *dst, bool locked,
+ free_folio_t put_new_folio, unsigned long private)
{
if (locked)
folio_unlock(dst);
- if (put_new_page)
- put_new_page(&dst->page, private);
+ if (put_new_folio)
+ put_new_folio(dst, private);
else
folio_put(dst);
}
@@ -1104,14 +1095,13 @@ static void migrate_folio_done(struct folio *src,
}
/* Obtain the lock on page, remove all ptes. */
-static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
- unsigned long private, struct folio *src,
- struct folio **dstp, enum migrate_mode mode,
- enum migrate_reason reason, struct list_head *ret)
+static int migrate_folio_unmap(new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ struct folio *src, struct folio **dstp, enum migrate_mode mode,
+ enum migrate_reason reason, struct list_head *ret)
{
struct folio *dst;
int rc = -EAGAIN;
- struct page *newpage = NULL;
int page_was_mapped = 0;
struct anon_vma *anon_vma = NULL;
bool is_lru = !__PageMovable(&src->page);
@@ -1128,10 +1118,9 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
return MIGRATEPAGE_SUCCESS;
}
- newpage = get_new_page(&src->page, private);
- if (!newpage)
+ dst = get_new_folio(src, private);
+ if (!dst)
return -ENOMEM;
- dst = page_folio(newpage);
*dstp = dst;
dst->private = NULL;
@@ -1156,6 +1145,14 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
if (current->flags & PF_MEMALLOC)
goto out;
+ /*
+ * In "light" mode, we can wait for transient locks (eg
+ * inserting a page into the page table), but it's not
+ * worth waiting for I/O.
+ */
+ if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src))
+ goto out;
+
folio_lock(src);
}
locked = true;
@@ -1251,13 +1248,13 @@ out:
ret = NULL;
migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret);
- migrate_folio_undo_dst(dst, dst_locked, put_new_page, private);
+ migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
return rc;
}
/* Migrate the folio to the newly allocated folio in dst. */
-static int migrate_folio_move(free_page_t put_new_page, unsigned long private,
+static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
struct folio *src, struct folio *dst,
enum migrate_mode mode, enum migrate_reason reason,
struct list_head *ret)
@@ -1329,7 +1326,7 @@ out:
}
migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret);
- migrate_folio_undo_dst(dst, true, put_new_page, private);
+ migrate_folio_undo_dst(dst, true, put_new_folio, private);
return rc;
}
@@ -1352,16 +1349,14 @@ out:
* because then pte is replaced with migration swap entry and direct I/O code
* will wait in the page fault for migration to complete.
*/
-static int unmap_and_move_huge_page(new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
- struct page *hpage, int force,
- enum migrate_mode mode, int reason,
- struct list_head *ret)
+static int unmap_and_move_huge_page(new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ struct folio *src, int force, enum migrate_mode mode,
+ int reason, struct list_head *ret)
{
- struct folio *dst, *src = page_folio(hpage);
+ struct folio *dst;
int rc = -EAGAIN;
int page_was_mapped = 0;
- struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
@@ -1371,10 +1366,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return MIGRATEPAGE_SUCCESS;
}
- new_hpage = get_new_page(hpage, private);
- if (!new_hpage)
+ dst = get_new_folio(src, private);
+ if (!dst)
return -ENOMEM;
- dst = page_folio(new_hpage);
if (!folio_trylock(src)) {
if (!force)
@@ -1415,7 +1409,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
* semaphore in write mode here and set TTU_RMAP_LOCKED
* to let lower levels know we have taken the lock.
*/
- mapping = hugetlb_page_mapping_lock_write(hpage);
+ mapping = hugetlb_page_mapping_lock_write(&src->page);
if (unlikely(!mapping))
goto unlock_put_anon;
@@ -1445,7 +1439,7 @@ put_anon:
if (rc == MIGRATEPAGE_SUCCESS) {
move_hugetlb_state(src, dst, reason);
- put_new_page = NULL;
+ put_new_folio = NULL;
}
out_unlock:
@@ -1461,8 +1455,8 @@ out:
* it. Otherwise, put_page() will drop the reference grabbed during
* isolation.
*/
- if (put_new_page)
- put_new_page(new_hpage, private);
+ if (put_new_folio)
+ put_new_folio(dst, private);
else
folio_putback_active_hugetlb(dst);
@@ -1509,8 +1503,8 @@ struct migrate_pages_stats {
* exist any more. It is caller's responsibility to call putback_movable_pages()
* only if ret != 0.
*/
-static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
+static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
enum migrate_mode mode, int reason,
struct migrate_pages_stats *stats,
struct list_head *ret_folios)
@@ -1548,9 +1542,9 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
continue;
}
- rc = unmap_and_move_huge_page(get_new_page,
- put_new_page, private,
- &folio->page, pass > 2, mode,
+ rc = unmap_and_move_huge_page(get_new_folio,
+ put_new_folio, private,
+ folio, pass > 2, mode,
reason, ret_folios);
/*
* The rules are:
@@ -1607,20 +1601,17 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
* deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the
* length of the from list must be <= 1.
*/
-static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
- enum migrate_mode mode, int reason, struct list_head *ret_folios,
- struct list_head *split_folios, struct migrate_pages_stats *stats,
- int nr_pass)
+static int migrate_pages_batch(struct list_head *from,
+ new_folio_t get_new_folio, free_folio_t put_new_folio,
+ unsigned long private, enum migrate_mode mode, int reason,
+ struct list_head *ret_folios, struct list_head *split_folios,
+ struct migrate_pages_stats *stats, int nr_pass)
{
int retry = 1;
- int large_retry = 1;
int thp_retry = 1;
int nr_failed = 0;
int nr_retry_pages = 0;
- int nr_large_failed = 0;
int pass = 0;
- bool is_large = false;
bool is_thp = false;
struct folio *folio, *folio2, *dst = NULL, *dst2;
int rc, rc_saved = 0, nr_pages;
@@ -1631,20 +1622,13 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
!list_empty(from) && !list_is_singular(from));
- for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
+ for (pass = 0; pass < nr_pass && retry; pass++) {
retry = 0;
- large_retry = 0;
thp_retry = 0;
nr_retry_pages = 0;
list_for_each_entry_safe(folio, folio2, from, lru) {
- /*
- * Large folio statistics is based on the source large
- * folio. Capture required information that might get
- * lost during migration.
- */
- is_large = folio_test_large(folio);
- is_thp = is_large && folio_test_pmd_mappable(folio);
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
nr_pages = folio_nr_pages(folio);
cond_resched();
@@ -1660,7 +1644,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
* list is processed.
*/
if (!thp_migration_supported() && is_thp) {
- nr_large_failed++;
+ nr_failed++;
stats->nr_thp_failed++;
if (!try_split_folio(folio, split_folios)) {
stats->nr_thp_split++;
@@ -1671,8 +1655,9 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
continue;
}
- rc = migrate_folio_unmap(get_new_page, put_new_page, private,
- folio, &dst, mode, reason, ret_folios);
+ rc = migrate_folio_unmap(get_new_folio, put_new_folio,
+ private, folio, &dst, mode, reason,
+ ret_folios);
/*
* The rules are:
* Success: folio will be freed
@@ -1688,38 +1673,33 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
* When memory is low, don't bother to try to migrate
* other folios, move unmapped folios, then exit.
*/
- if (is_large) {
- nr_large_failed++;
- stats->nr_thp_failed += is_thp;
- /* Large folio NUMA faulting doesn't split to retry. */
- if (!nosplit) {
- int ret = try_split_folio(folio, split_folios);
-
- if (!ret) {
- stats->nr_thp_split += is_thp;
- break;
- } else if (reason == MR_LONGTERM_PIN &&
- ret == -EAGAIN) {
- /*
- * Try again to split large folio to
- * mitigate the failure of longterm pinning.
- */
- large_retry++;
- thp_retry += is_thp;
- nr_retry_pages += nr_pages;
- /* Undo duplicated failure counting. */
- nr_large_failed--;
- stats->nr_thp_failed -= is_thp;
- break;
- }
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
+ /* Large folio NUMA faulting doesn't split to retry. */
+ if (folio_test_large(folio) && !nosplit) {
+ int ret = try_split_folio(folio, split_folios);
+
+ if (!ret) {
+ stats->nr_thp_split += is_thp;
+ break;
+ } else if (reason == MR_LONGTERM_PIN &&
+ ret == -EAGAIN) {
+ /*
+ * Try again to split large folio to
+ * mitigate the failure of longterm pinning.
+ */
+ retry++;
+ thp_retry += is_thp;
+ nr_retry_pages += nr_pages;
+ /* Undo duplicated failure counting. */
+ nr_failed--;
+ stats->nr_thp_failed -= is_thp;
+ break;
}
- } else {
- nr_failed++;
}
stats->nr_failed_pages += nr_pages + nr_retry_pages;
/* nr_failed isn't updated for not used */
- nr_large_failed += large_retry;
stats->nr_thp_failed += thp_retry;
rc_saved = rc;
if (list_empty(&unmap_folios))
@@ -1727,12 +1707,8 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
else
goto move;
case -EAGAIN:
- if (is_large) {
- large_retry++;
- thp_retry += is_thp;
- } else {
- retry++;
- }
+ retry++;
+ thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
case MIGRATEPAGE_SUCCESS:
@@ -1750,20 +1726,14 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
* removed from migration folio list and not
* retried in the next outer loop.
*/
- if (is_large) {
- nr_large_failed++;
- stats->nr_thp_failed += is_thp;
- } else {
- nr_failed++;
- }
-
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
stats->nr_failed_pages += nr_pages;
break;
}
}
}
nr_failed += retry;
- nr_large_failed += large_retry;
stats->nr_thp_failed += thp_retry;
stats->nr_failed_pages += nr_retry_pages;
move:
@@ -1771,22 +1741,20 @@ move:
try_to_unmap_flush();
retry = 1;
- for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
+ for (pass = 0; pass < nr_pass && retry; pass++) {
retry = 0;
- large_retry = 0;
thp_retry = 0;
nr_retry_pages = 0;
dst = list_first_entry(&dst_folios, struct folio, lru);
dst2 = list_next_entry(dst, lru);
list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
- is_large = folio_test_large(folio);
- is_thp = is_large && folio_test_pmd_mappable(folio);
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
nr_pages = folio_nr_pages(folio);
cond_resched();
- rc = migrate_folio_move(put_new_page, private,
+ rc = migrate_folio_move(put_new_folio, private,
folio, dst, mode,
reason, ret_folios);
/*
@@ -1797,12 +1765,8 @@ move:
*/
switch(rc) {
case -EAGAIN:
- if (is_large) {
- large_retry++;
- thp_retry += is_thp;
- } else {
- retry++;
- }
+ retry++;
+ thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
case MIGRATEPAGE_SUCCESS:
@@ -1810,13 +1774,8 @@ move:
stats->nr_thp_succeeded += is_thp;
break;
default:
- if (is_large) {
- nr_large_failed++;
- stats->nr_thp_failed += is_thp;
- } else {
- nr_failed++;
- }
-
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
stats->nr_failed_pages += nr_pages;
break;
}
@@ -1825,14 +1784,10 @@ move:
}
}
nr_failed += retry;
- nr_large_failed += large_retry;
stats->nr_thp_failed += thp_retry;
stats->nr_failed_pages += nr_retry_pages;
- if (rc_saved)
- rc = rc_saved;
- else
- rc = nr_failed + nr_large_failed;
+ rc = rc_saved ? : nr_failed;
out:
/* Cleanup remaining folios */
dst = list_first_entry(&dst_folios, struct folio, lru);
@@ -1845,7 +1800,7 @@ out:
migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
true, ret_folios);
list_del(&dst->lru);
- migrate_folio_undo_dst(dst, true, put_new_page, private);
+ migrate_folio_undo_dst(dst, true, put_new_folio, private);
dst = dst2;
dst2 = list_next_entry(dst, lru);
}
@@ -1853,10 +1808,11 @@ out:
return rc;
}
-static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
- enum migrate_mode mode, int reason, struct list_head *ret_folios,
- struct list_head *split_folios, struct migrate_pages_stats *stats)
+static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ enum migrate_mode mode, int reason,
+ struct list_head *ret_folios, struct list_head *split_folios,
+ struct migrate_pages_stats *stats)
{
int rc, nr_failed = 0;
LIST_HEAD(folios);
@@ -1864,7 +1820,7 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
memset(&astats, 0, sizeof(astats));
/* Try to migrate in batch with MIGRATE_ASYNC mode firstly */
- rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC,
+ rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC,
reason, &folios, split_folios, &astats,
NR_MAX_MIGRATE_ASYNC_RETRY);
stats->nr_succeeded += astats.nr_succeeded;
@@ -1886,7 +1842,7 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
list_splice_tail_init(&folios, from);
while (!list_empty(from)) {
list_move(from->next, &folios);
- rc = migrate_pages_batch(&folios, get_new_page, put_new_page,
+ rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
private, mode, reason, ret_folios,
split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
list_splice_tail_init(&folios, ret_folios);
@@ -1903,11 +1859,11 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
* supplied as the target for the page migration
*
* @from: The list of folios to be migrated.
- * @get_new_page: The function used to allocate free folios to be used
+ * @get_new_folio: The function used to allocate free folios to be used
* as the target of the folio migration.
- * @put_new_page: The function used to free target folios if migration
+ * @put_new_folio: The function used to free target folios if migration
* fails, or NULL if no special handling is necessary.
- * @private: Private data to be passed on to get_new_page()
+ * @private: Private data to be passed on to get_new_folio()
* @mode: The migration mode that specifies the constraints for
* folio migration, if any.
* @reason: The reason for folio migration.
@@ -1924,8 +1880,8 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
* considered as the number of non-migrated large folio, no matter how many
* split folios of the large folio are migrated successfully.
*/
-int migrate_pages(struct list_head *from, new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
+int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
{
int rc, rc_gather;
@@ -1940,7 +1896,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
memset(&stats, 0, sizeof(stats));
- rc_gather = migrate_hugetlbs(from, get_new_page, put_new_page, private,
+ rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private,
mode, reason, &stats, &ret_folios);
if (rc_gather < 0)
goto out;
@@ -1963,12 +1919,14 @@ again:
else
list_splice_init(from, &folios);
if (mode == MIGRATE_ASYNC)
- rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
- mode, reason, &ret_folios, &split_folios, &stats,
- NR_MAX_MIGRATE_PAGES_RETRY);
+ rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
+ private, mode, reason, &ret_folios,
+ &split_folios, &stats,
+ NR_MAX_MIGRATE_PAGES_RETRY);
else
- rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private,
- mode, reason, &ret_folios, &split_folios, &stats);
+ rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio,
+ private, mode, reason, &ret_folios,
+ &split_folios, &stats);
list_splice_tail_init(&folios, &ret_folios);
if (rc < 0) {
rc_gather = rc;
@@ -1981,8 +1939,9 @@ again:
* is counted as 1 failure already. And, we only try to migrate
* with minimal effort, force MIGRATE_ASYNC mode and retry once.
*/
- migrate_pages_batch(&split_folios, get_new_page, put_new_page, private,
- MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1);
+ migrate_pages_batch(&split_folios, get_new_folio,
+ put_new_folio, private, MIGRATE_ASYNC, reason,
+ &ret_folios, NULL, &stats, 1);
list_splice_tail_init(&split_folios, &ret_folios);
}
rc_gather += rc;
@@ -2017,14 +1976,11 @@ out:
return rc_gather;
}
-struct page *alloc_migration_target(struct page *page, unsigned long private)
+struct folio *alloc_migration_target(struct folio *src, unsigned long private)
{
- struct folio *folio = page_folio(page);
struct migration_target_control *mtc;
gfp_t gfp_mask;
unsigned int order = 0;
- struct folio *hugetlb_folio = NULL;
- struct folio *new_folio = NULL;
int nid;
int zidx;
@@ -2032,33 +1988,30 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
gfp_mask = mtc->gfp_mask;
nid = mtc->nid;
if (nid == NUMA_NO_NODE)
- nid = folio_nid(folio);
+ nid = folio_nid(src);
- if (folio_test_hugetlb(folio)) {
- struct hstate *h = folio_hstate(folio);
+ if (folio_test_hugetlb(src)) {
+ struct hstate *h = folio_hstate(src);
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
- hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid,
+ return alloc_hugetlb_folio_nodemask(h, nid,
mtc->nmask, gfp_mask);
- return &hugetlb_folio->page;
}
- if (folio_test_large(folio)) {
+ if (folio_test_large(src)) {
/*
* clear __GFP_RECLAIM to make the migration callback
* consistent with regular THP allocations.
*/
gfp_mask &= ~__GFP_RECLAIM;
gfp_mask |= GFP_TRANSHUGE;
- order = folio_order(folio);
+ order = folio_order(src);
}
- zidx = zone_idx(folio_zone(folio));
+ zidx = zone_idx(folio_zone(src));
if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
gfp_mask |= __GFP_HIGHMEM;
- new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);
-
- return &new_folio->page;
+ return __folio_alloc(gfp_mask, order, nid, mtc->nmask);
}
#ifdef CONFIG_NUMA
@@ -2509,13 +2462,12 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
return false;
}
-static struct page *alloc_misplaced_dst_page(struct page *page,
+static struct folio *alloc_misplaced_dst_folio(struct folio *src,
unsigned long data)
{
int nid = (int) data;
- int order = compound_order(page);
+ int order = folio_order(src);
gfp_t gfp = __GFP_THISNODE;
- struct folio *new;
if (order > 0)
gfp |= GFP_TRANSHUGE_LIGHT;
@@ -2524,9 +2476,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
__GFP_NOWARN;
gfp &= ~__GFP_RECLAIM;
}
- new = __folio_alloc_node(gfp, order, nid);
-
- return &new->page;
+ return __folio_alloc_node(gfp, order, nid);
}
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
@@ -2604,7 +2554,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
goto out;
list_add(&page->lru, &migratepages);
- nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+ nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
NULL, node, MIGRATE_ASYNC,
MR_NUMA_MISPLACED, &nr_succeeded);
if (nr_remaining) {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index d30c9de60b0d..8365158460ed 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -83,9 +83,6 @@ again:
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
split_huge_pmd(vma, pmdp, addr);
- if (pmd_trans_unstable(pmdp))
- return migrate_vma_collect_skip(start, end,
- walk);
} else {
int ret;
@@ -100,16 +97,12 @@ again:
if (ret)
return migrate_vma_collect_skip(start, end,
walk);
- if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, -1,
- walk);
}
}
- if (unlikely(pmd_bad(*pmdp)))
- return migrate_vma_collect_skip(start, end, walk);
-
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!ptep)
+ goto again;
arch_enter_lazy_mmu_mode();
for (; addr < end; addr += PAGE_SIZE, ptep++) {
@@ -118,7 +111,7 @@ again:
swp_entry_t entry;
pte_t pte;
- pte = *ptep;
+ pte = ptep_get(ptep);
if (pte_none(pte)) {
if (vma_is_anonymous(vma)) {
@@ -201,7 +194,7 @@ again:
bool anon_exclusive;
pte_t swp_pte;
- flush_cache_page(vma, addr, pte_pfn(*ptep));
+ flush_cache_page(vma, addr, pte_pfn(pte));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
pte = ptep_clear_flush(vma, addr, ptep);
@@ -383,7 +376,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
/* ZONE_DEVICE pages are not on LRU */
if (!is_zone_device_page(page)) {
if (!PageLRU(page) && allow_drain) {
- /* Drain CPU's pagevec */
+ /* Drain CPU's lru cache */
lru_add_drain_all();
allow_drain = false;
}
@@ -580,6 +573,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
+ pte_t orig_pte;
/* Only allow populating anonymous memory */
if (!vma_is_anonymous(vma))
@@ -595,27 +589,10 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pmdp = pmd_alloc(mm, pudp, addr);
if (!pmdp)
goto abort;
-
if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
goto abort;
-
- /*
- * Use pte_alloc() instead of pte_alloc_map(). We can't run
- * pte_offset_map() on pmds where a huge pmd might be created
- * from a different thread.
- *
- * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
- * parallel threads are excluded by other means.
- *
- * Here we only have mmap_read_lock(mm).
- */
if (pte_alloc(mm, pmdp))
goto abort;
-
- /* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(pmdp)))
- goto abort;
-
if (unlikely(anon_vma_prepare(vma)))
goto abort;
if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
@@ -650,17 +627,20 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
}
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!ptep)
+ goto abort;
+ orig_pte = ptep_get(ptep);
if (check_stable_address_space(mm))
goto unlock_abort;
- if (pte_present(*ptep)) {
- unsigned long pfn = pte_pfn(*ptep);
+ if (pte_present(orig_pte)) {
+ unsigned long pfn = pte_pfn(orig_pte);
if (!is_zero_pfn(pfn))
goto unlock_abort;
flush = true;
- } else if (!pte_none(*ptep))
+ } else if (!pte_none(orig_pte))
goto unlock_abort;
/*
@@ -677,7 +657,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
get_page(page);
if (flush) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
+ flush_cache_page(vma, addr, pte_pfn(orig_pte));
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, entry);
update_mmu_cache(vma, addr, ptep);
diff --git a/mm/mincore.c b/mm/mincore.c
index 2d5be013a25a..b7f7a516b26c 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -113,14 +113,13 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
goto out;
}
- if (pmd_trans_unstable(pmd)) {
- __mincore_unmapped_range(addr, end, vma, vec);
- goto out;
- }
-
ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!ptep) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (; addr != end; ptep++, addr += PAGE_SIZE) {
- pte_t pte = *ptep;
+ pte_t pte = ptep_get(ptep);
/* We need to do cache lookup too for pte markers */
if (pte_none_mostly(pte))
diff --git a/mm/mlock.c b/mm/mlock.c
index 40b43f8740df..d7db94519884 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -312,6 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *start_pte, *pte;
+ pte_t ptent;
struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
@@ -329,10 +330,15 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
}
start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!start_pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
- if (!pte_present(*pte))
+ ptent = ptep_get(pte);
+ if (!pte_present(ptent))
continue;
- folio = vm_normal_folio(vma, addr, *pte);
+ folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
if (folio_test_large(folio))
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 1cfc08e25f93..a1963c3322af 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -259,6 +259,8 @@ static int __init cmdline_parse_core(char *p, unsigned long *core,
return 0;
}
+bool mirrored_kernelcore __initdata_memblock;
+
/*
* kernelcore=size sets the amount of memory for use for allocations that
* cannot be reclaimed or migrated.
@@ -644,10 +646,8 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
}
/* Returns true if the struct page for the pfn is initialised */
-static inline bool __meminit early_page_initialised(unsigned long pfn)
+static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
{
- int nid = early_pfn_to_nid(pfn);
-
if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
return false;
@@ -693,15 +693,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn, int nid)
{
pg_data_t *pgdat;
- int nid, zid;
+ int zid;
- if (early_page_initialised(pfn))
+ if (early_page_initialised(pfn, nid))
return;
- nid = early_pfn_to_nid(pfn);
pgdat = NODE_DATA(nid);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
@@ -715,7 +714,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
-static inline bool early_page_initialised(unsigned long pfn)
+static inline bool early_page_initialised(unsigned long pfn, int nid)
{
return true;
}
@@ -725,7 +724,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_reserved_page(unsigned long pfn)
+static inline void init_reserved_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
@@ -736,7 +735,8 @@ static inline void init_reserved_page(unsigned long pfn)
* marks the pages PageReserved. The remaining valid pages are later
* sent to the buddy page allocator.
*/
-void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
+void __meminit reserve_bootmem_region(phys_addr_t start,
+ phys_addr_t end, int nid)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
@@ -745,7 +745,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
if (pfn_valid(start_pfn)) {
struct page *page = pfn_to_page(start_pfn);
- init_reserved_page(start_pfn);
+ init_reserved_page(start_pfn, nid);
/* Avoid false-positive PageTail() */
INIT_LIST_HEAD(&page->lru);
@@ -1166,24 +1166,15 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
/* Return the number of page frames in holes in a zone on a node */
static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn)
+ unsigned long zone_start_pfn,
+ unsigned long zone_end_pfn)
{
- unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
- unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- unsigned long zone_start_pfn, zone_end_pfn;
unsigned long nr_absent;
- /* When hotadd a new node from cpu_up(), the node should be empty */
- if (!node_start_pfn && !node_end_pfn)
+ /* zone is empty, we don't have any absent pages */
+ if (zone_start_pfn == zone_end_pfn)
return 0;
- zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
- zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
-
- adjust_zone_range_for_zone_movable(nid, zone_type,
- node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
/*
@@ -1227,9 +1218,6 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- /* When hotadd a new node from cpu_up(), the node should be empty */
- if (!node_start_pfn && !node_end_pfn)
- return 0;
/* Get the start and end of the zone */
*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
@@ -1250,6 +1238,24 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
return *zone_end_pfn - *zone_start_pfn;
}
+static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
+ z->zone_start_pfn = 0;
+ z->spanned_pages = 0;
+ z->present_pages = 0;
+#if defined(CONFIG_MEMORY_HOTPLUG)
+ z->present_early_pages = 0;
+#endif
+ }
+
+ pgdat->node_spanned_pages = 0;
+ pgdat->node_present_pages = 0;
+ pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
+}
+
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
unsigned long node_end_pfn)
@@ -1261,7 +1267,7 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
struct zone *zone = pgdat->node_zones + i;
unsigned long zone_start_pfn, zone_end_pfn;
unsigned long spanned, absent;
- unsigned long size, real_size;
+ unsigned long real_size;
spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
@@ -1269,23 +1275,22 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
&zone_start_pfn,
&zone_end_pfn);
absent = zone_absent_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn);
+ zone_start_pfn,
+ zone_end_pfn);
- size = spanned;
- real_size = size - absent;
+ real_size = spanned - absent;
- if (size)
+ if (spanned)
zone->zone_start_pfn = zone_start_pfn;
else
zone->zone_start_pfn = 0;
- zone->spanned_pages = size;
+ zone->spanned_pages = spanned;
zone->present_pages = real_size;
#if defined(CONFIG_MEMORY_HOTPLUG)
zone->present_early_pages = real_size;
#endif
- totalpages += size;
+ totalpages += spanned;
realtotalpages += real_size;
}
@@ -1506,6 +1511,8 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
pgdat->kswapd_order = 0;
pgdat->kswapd_highest_zoneidx = 0;
pgdat->node_start_pfn = 0;
+ pgdat->node_present_pages = 0;
+
for_each_online_cpu(cpu) {
struct per_cpu_nodestat *p;
@@ -1513,8 +1520,17 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
memset(p, 0, sizeof(*p));
}
- for (z = 0; z < MAX_NR_ZONES; z++)
- zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+ /*
+ * When memory is hot-added, all the memory is in offline state. So
+ * clear all zones' present_pages and managed_pages because they will
+ * be updated in online_pages() and offline_pages().
+ */
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ zone->present_pages = 0;
+ zone_init_internals(zone, z, nid, 0);
+ }
}
#endif
@@ -1582,7 +1598,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
if (!size)
continue;
- set_pageblock_order();
setup_usemap(zone);
init_currently_empty_zone(zone, zone->zone_start_pfn, size);
}
@@ -1706,11 +1721,13 @@ static void __init free_area_init_node(int nid)
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
} else {
pr_info("Initmem setup node %d as memoryless\n", nid);
- }
- calculate_node_totalpages(pgdat, start_pfn, end_pfn);
+ reset_memoryless_node_totalpages(pgdat);
+ }
alloc_node_mem_map(pgdat);
pgdat_set_deferred_range(pgdat);
@@ -1720,7 +1737,7 @@ static void __init free_area_init_node(int nid)
}
/* Any regular or high memory on that node ? */
-static void check_for_memory(pg_data_t *pgdat, int nid)
+static void check_for_memory(pg_data_t *pgdat)
{
enum zone_type zone_type;
@@ -1728,9 +1745,9 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
struct zone *zone = &pgdat->node_zones[zone_type];
if (populated_zone(zone)) {
if (IS_ENABLED(CONFIG_HIGHMEM))
- node_set_state(nid, N_HIGH_MEMORY);
+ node_set_state(pgdat->node_id, N_HIGH_MEMORY);
if (zone_type <= ZONE_NORMAL)
- node_set_state(nid, N_NORMAL_MEMORY);
+ node_set_state(pgdat->node_id, N_NORMAL_MEMORY);
break;
}
}
@@ -1749,11 +1766,6 @@ void __init setup_nr_node_ids(void)
}
#endif
-static void __init free_area_init_memoryless_node(int nid)
-{
- free_area_init_node(nid);
-}
-
/*
* Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
* such cases we allow max_zone_pfn sorted in the descending order
@@ -1852,6 +1864,8 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
+ set_pageblock_order();
+
for_each_node(nid) {
pg_data_t *pgdat;
@@ -1864,7 +1878,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
panic("Cannot allocate %zuB for node %d.\n",
sizeof(*pgdat), nid);
arch_refresh_nodedata(nid, pgdat);
- free_area_init_memoryless_node(nid);
+ free_area_init_node(nid);
/*
* We do not want to confuse userspace by sysfs
@@ -1885,7 +1899,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Any memory on that node */
if (pgdat->node_present_pages)
node_set_state(nid, N_MEMORY);
- check_for_memory(pgdat, nid);
+ check_for_memory(pgdat);
}
memmap_init();
@@ -2335,6 +2349,28 @@ void __init init_cma_reserved_pageblock(struct page *page)
}
#endif
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = pageblock_end_pfn(block_start_pfn);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ cond_resched();
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
void __init page_alloc_init_late(void)
{
struct zone *zone;
@@ -2375,6 +2411,8 @@ void __init page_alloc_init_late(void)
/* Initialize page ext after all struct pages are initialized. */
if (deferred_struct_pages)
page_ext_init();
+
+ page_alloc_sysctl_init();
}
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
@@ -2539,8 +2577,14 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
- if (!early_page_initialised(pfn))
- return;
+
+ if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
+ int nid = early_pfn_to_nid(pfn);
+
+ if (!early_page_initialised(pfn, nid))
+ return;
+ }
+
if (!kmsan_memblock_free_pages(page, order)) {
/* KMSAN will take care of these pages. */
return;
@@ -2548,6 +2592,12 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
__free_pages_core(page, order);
}
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
+EXPORT_SYMBOL(init_on_alloc);
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
+EXPORT_SYMBOL(init_on_free);
+
static bool _init_on_alloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
diff --git a/mm/mmap.c b/mm/mmap.c
index d600404580b2..8f1000bc35df 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -182,7 +182,8 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;
- return mlock_future_check(current->mm, current->mm->def_flags, len);
+ return mlock_future_ok(current->mm, current->mm->def_flags, len)
+ ? 0 : -EAGAIN;
}
static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
unsigned long addr, unsigned long request, unsigned long flags);
@@ -300,61 +301,40 @@ out:
}
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
-extern void mt_validate(struct maple_tree *mt);
-extern void mt_dump(const struct maple_tree *mt);
-
-/* Validate the maple tree */
-static void validate_mm_mt(struct mm_struct *mm)
-{
- struct maple_tree *mt = &mm->mm_mt;
- struct vm_area_struct *vma_mt;
-
- MA_STATE(mas, mt, 0, 0);
-
- mt_validate(&mm->mm_mt);
- mas_for_each(&mas, vma_mt, ULONG_MAX) {
- if ((vma_mt->vm_start != mas.index) ||
- (vma_mt->vm_end - 1 != mas.last)) {
- pr_emerg("issue in %s\n", current->comm);
- dump_stack();
- dump_vma(vma_mt);
- pr_emerg("mt piv: %p %lu - %lu\n", vma_mt,
- mas.index, mas.last);
- pr_emerg("mt vma: %p %lu - %lu\n", vma_mt,
- vma_mt->vm_start, vma_mt->vm_end);
-
- mt_dump(mas.tree);
- if (vma_mt->vm_end != mas.last + 1) {
- pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n",
- mm, vma_mt->vm_start, vma_mt->vm_end,
- mas.index, mas.last);
- mt_dump(mas.tree);
- }
- VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm);
- if (vma_mt->vm_start != mas.index) {
- pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n",
- mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end);
- mt_dump(mas.tree);
- }
- VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm);
- }
- }
-}
-
static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
struct vm_area_struct *vma;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
-
- validate_mm_mt(mm);
+ VMA_ITERATOR(vmi, mm, 0);
- mas_for_each(&mas, vma, ULONG_MAX) {
+ mt_validate(&mm->mm_mt);
+ for_each_vma(vmi, vma) {
#ifdef CONFIG_DEBUG_VM_RB
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
+#endif
+ unsigned long vmi_start, vmi_end;
+ bool warn = 0;
+
+ vmi_start = vma_iter_addr(&vmi);
+ vmi_end = vma_iter_end(&vmi);
+ if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
+ warn = 1;
+
+ if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
+ warn = 1;
+ if (warn) {
+ pr_emerg("issue in %s\n", current->comm);
+ dump_stack();
+ dump_vma(vma);
+ pr_emerg("tree range: %px start %lx end %lx\n", vma,
+ vmi_start, vmi_end - 1);
+ vma_iter_dump_tree(&vmi);
+ }
+
+#ifdef CONFIG_DEBUG_VM_RB
if (anon_vma) {
anon_vma_lock_read(anon_vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
@@ -365,14 +345,13 @@ static void validate_mm(struct mm_struct *mm)
i++;
}
if (i != mm->map_count) {
- pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i);
+ pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
bug = 1;
}
VM_BUG_ON_MM(bug, mm);
}
#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
-#define validate_mm_mt(root) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
@@ -1167,21 +1146,21 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-int mlock_future_check(struct mm_struct *mm, unsigned long flags,
- unsigned long len)
+bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+ unsigned long bytes)
{
- unsigned long locked, lock_limit;
+ unsigned long locked_pages, limit_pages;
- /* mlock MCL_FUTURE? */
- if (flags & VM_LOCKED) {
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
- return 0;
+ if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+ return true;
+
+ locked_pages = bytes >> PAGE_SHIFT;
+ locked_pages += mm->locked_vm;
+
+ limit_pages = rlimit(RLIMIT_MEMLOCK);
+ limit_pages >>= PAGE_SHIFT;
+
+ return locked_pages <= limit_pages;
}
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
@@ -1293,7 +1272,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- if (mlock_future_check(mm, vm_flags, len))
+ if (!mlock_future_ok(mm, vm_flags, len))
return -EAGAIN;
if (file) {
@@ -1475,6 +1454,48 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
+{
+ return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
+}
+
+static bool vma_is_shared_writable(struct vm_area_struct *vma)
+{
+ return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
+ (VM_WRITE | VM_SHARED);
+}
+
+static bool vma_fs_can_writeback(struct vm_area_struct *vma)
+{
+ /* No managed pages to writeback. */
+ if (vma->vm_flags & VM_PFNMAP)
+ return false;
+
+ return vma->vm_file && vma->vm_file->f_mapping &&
+ mapping_can_writeback(vma->vm_file->f_mapping);
+}
+
+/*
+ * Does this VMA require the underlying folios to have their dirty state
+ * tracked?
+ */
+bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
+{
+ /* Only shared, writable VMAs require dirty tracking. */
+ if (!vma_is_shared_writable(vma))
+ return false;
+
+ /* Does the filesystem need to be notified? */
+ if (vm_ops_needs_writenotify(vma->vm_ops))
+ return true;
+
+ /*
+ * Even if the filesystem doesn't indicate a need for writenotify, if it
+ * can writeback, dirty tracking is still required.
+ */
+ return vma_fs_can_writeback(vma);
+}
+
/*
* Some shared mappings will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
@@ -1483,21 +1504,18 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
*/
int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
- vm_flags_t vm_flags = vma->vm_flags;
- const struct vm_operations_struct *vm_ops = vma->vm_ops;
-
/* If it was private or non-writable, the write bit is already clear */
- if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+ if (!vma_is_shared_writable(vma))
return 0;
/* The backer wishes to know when pages are first written to? */
- if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
+ if (vm_ops_needs_writenotify(vma->vm_ops))
return 1;
/* The open routine did something to the protections that pgprot_modify
* won't preserve? */
if (pgprot_val(vm_page_prot) !=
- pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
+ pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
return 0;
/*
@@ -1511,13 +1529,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
if (userfaultfd_wp(vma))
return 1;
- /* Specialty mapping? */
- if (vm_flags & VM_PFNMAP)
- return 0;
-
/* Can the mapping track the dirty pages? */
- return vma->vm_file && vma->vm_file->f_mapping &&
- mapping_can_writeback(vma->vm_file->f_mapping);
+ return vma_fs_can_writeback(vma);
}
/*
@@ -1911,7 +1924,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
return -ENOMEM;
/* mlock limit tests */
- if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT))
+ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
return -ENOMEM;
/* Check to ensure the stack will not grow into a hugetlb-only region */
@@ -2234,7 +2247,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct *new;
int err;
- validate_mm_mt(vma->vm_mm);
+ validate_mm(vma->vm_mm);
WARN_ON(vma->vm_start >= addr);
WARN_ON(vma->vm_end <= addr);
@@ -2292,7 +2305,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* Success. */
if (new_below)
vma_next(vmi);
- validate_mm_mt(vma->vm_mm);
+ validate_mm(vma->vm_mm);
return 0;
out_free_mpol:
@@ -2301,7 +2314,7 @@ out_free_vmi:
vma_iter_free(vmi);
out_free_vma:
vm_area_free(new);
- validate_mm_mt(vma->vm_mm);
+ validate_mm(vma->vm_mm);
return err;
}
@@ -2394,28 +2407,32 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
locked_vm += vma_pages(next);
count++;
+ if (unlikely(uf)) {
+ /*
+ * If userfaultfd_unmap_prep returns an error the vmas
+ * will remain split, but userland will get a
+ * highly unexpected error anyway. This is no
+ * different than the case where the first of the two
+ * __split_vma fails, but we don't undo the first
+ * split, despite we could. This is unlikely enough
+ * failure that it's not worth optimizing it for.
+ */
+ error = userfaultfd_unmap_prep(next, start, end, uf);
+
+ if (error)
+ goto userfaultfd_error;
+ }
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
BUG_ON(next->vm_start < start);
BUG_ON(next->vm_start > end);
#endif
}
- next = vma_next(vmi);
- if (unlikely(uf)) {
- /*
- * If userfaultfd_unmap_prep returns an error the vmas
- * will remain split, but userland will get a
- * highly unexpected error anyway. This is no
- * different than the case where the first of the two
- * __split_vma fails, but we don't undo the first
- * split, despite we could. This is unlikely enough
- * failure that it's not worth optimizing it for.
- */
- error = userfaultfd_unmap_prep(mm, start, end, uf);
+ if (vma_iter_end(vmi) > end)
+ next = vma_iter_load(vmi);
- if (error)
- goto userfaultfd_error;
- }
+ if (!next)
+ next = vma_next(vmi);
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/* Make sure no VMAs are about to be lost. */
@@ -2620,6 +2637,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
cannot_expand:
+ if (prev)
+ vma_iter_next_range(&vmi);
+
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
@@ -2933,7 +2953,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
arch_unmap(mm, start, end);
ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade);
- validate_mm_mt(mm);
+ validate_mm(mm);
return ret;
}
@@ -2955,7 +2975,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm = current->mm;
struct vma_prepare vp;
- validate_mm_mt(mm);
+ validate_mm(mm);
/*
* Check against address space limits by the changed size
* Note: This happens *after* clearing old mappings in some code paths.
@@ -3196,7 +3216,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
bool faulted_in_anon_vma = true;
VMA_ITERATOR(vmi, mm, addr);
- validate_mm_mt(mm);
+ validate_mm(mm);
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
@@ -3255,7 +3275,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
goto out_vma_link;
*need_rmap_locks = false;
}
- validate_mm_mt(mm);
+ validate_mm(mm);
return new_vma;
out_vma_link:
@@ -3271,7 +3291,7 @@ out_free_mempol:
out_free_vma:
vm_area_free(new_vma);
out:
- validate_mm_mt(mm);
+ validate_mm(mm);
return NULL;
}
@@ -3408,7 +3428,7 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
- validate_mm_mt(mm);
+ validate_mm(mm);
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
@@ -3431,12 +3451,12 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
- validate_mm_mt(mm);
+ validate_mm(mm);
return vma;
out:
vm_area_free(vma);
- validate_mm_mt(mm);
+ validate_mm(mm);
return ERR_PTR(ret);
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c59e7561698c..6f658d483704 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -93,22 +93,9 @@ static long change_pte_range(struct mmu_gather *tlb,
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
tlb_change_page_size(tlb, PAGE_SIZE);
-
- /*
- * Can be called with only the mmap_lock for reading by
- * prot_numa so we must check the pmd isn't constantly
- * changing from under us from pmd_none to pmd_trans_huge
- * and/or the other way around.
- */
- if (pmd_trans_unstable(pmd))
- return 0;
-
- /*
- * The pmd points to a regular pte so the pmd can't change
- * from under us even if the mmap_lock is only hold for
- * reading.
- */
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return -EAGAIN;
/* Get target node for single threaded private VMAs */
if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
@@ -118,7 +105,7 @@ static long change_pte_range(struct mmu_gather *tlb,
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
- oldpte = *pte;
+ oldpte = ptep_get(pte);
if (pte_present(oldpte)) {
pte_t ptent;
@@ -302,31 +289,6 @@ static long change_pte_range(struct mmu_gather *tlb,
}
/*
- * Used when setting automatic NUMA hinting protection where it is
- * critical that a numa hinting PMD is not confused with a bad PMD.
- */
-static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
-{
- pmd_t pmdval = pmdp_get_lockless(pmd);
-
- /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- barrier();
-#endif
-
- if (pmd_none(pmdval))
- return 1;
- if (pmd_trans_huge(pmdval))
- return 0;
- if (unlikely(pmd_bad(pmdval))) {
- pmd_clear_bad(pmd);
- return 1;
- }
-
- return 0;
-}
-
-/*
* Return true if we want to split THPs into PTE mappings in change
* protection procedure, false otherwise.
*/
@@ -403,7 +365,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
long ret;
-
+ pmd_t _pmd;
+again:
next = pmd_addr_end(addr, end);
ret = change_pmd_prepare(vma, pmd, cp_flags);
@@ -411,16 +374,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
pages = ret;
break;
}
- /*
- * Automatic NUMA balancing walks the tables with mmap_lock
- * held for read. It's possible a parallel update to occur
- * between pmd_trans_huge() and a pmd_none_or_clear_bad()
- * check leading to a false positive and clearing.
- * Hence, it's necessary to atomically read the PMD value
- * for all the checks.
- */
- if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
- pmd_none_or_clear_bad_unless_trans_huge(pmd))
+
+ if (pmd_none(*pmd))
goto next;
/* invoke the mmu notifier if the pmd is populated */
@@ -431,7 +386,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
mmu_notifier_invalidate_range_start(&range);
}
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ _pmd = pmdp_get_lockless(pmd);
+ if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
pgtable_split_needed(vma, cp_flags)) {
__split_huge_pmd(vma, pmd, addr, false, NULL);
@@ -446,15 +402,10 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
break;
}
} else {
- /*
- * change_huge_pmd() does not defer TLB flushes,
- * so no need to propagate the tlb argument.
- */
- int nr_ptes = change_huge_pmd(tlb, vma, pmd,
+ ret = change_huge_pmd(tlb, vma, pmd,
addr, newprot, cp_flags);
-
- if (nr_ptes) {
- if (nr_ptes == HPAGE_PMD_NR) {
+ if (ret) {
+ if (ret == HPAGE_PMD_NR) {
pages += HPAGE_PMD_NR;
nr_huge_updates++;
}
@@ -465,8 +416,12 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
}
/* fall through, the trans huge pmd just split */
}
- pages += change_pte_range(tlb, vma, pmd, addr, next,
- newprot, cp_flags);
+
+ ret = change_pte_range(tlb, vma, pmd, addr, next, newprot,
+ cp_flags);
+ if (ret < 0)
+ goto again;
+ pages += ret;
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
@@ -589,7 +544,8 @@ long change_protection(struct mmu_gather *tlb,
static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
- return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
+ *(pgprot_t *)(walk->private)) ?
0 : -EACCES;
}
@@ -597,7 +553,8 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long next,
struct mm_walk *walk)
{
- return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
+ *(pgprot_t *)(walk->private)) ?
0 : -EACCES;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 3185724d8b13..fe6b722ae633 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -133,7 +133,7 @@ static pte_t move_soft_dirty_pte(pte_t pte)
return pte;
}
-static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
@@ -143,6 +143,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
unsigned long len = old_end - old_addr;
+ int err = 0;
/*
* When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
@@ -170,8 +171,16 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* pte locks because exclusive mmap_lock prevents deadlock.
*/
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- new_pte = pte_offset_map(new_pmd, new_addr);
- new_ptl = pte_lockptr(mm, new_pmd);
+ if (!old_pte) {
+ err = -EAGAIN;
+ goto out;
+ }
+ new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
+ if (!new_pte) {
+ pte_unmap_unlock(old_pte, old_ptl);
+ err = -EAGAIN;
+ goto out;
+ }
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
flush_tlb_batched_pending(vma->vm_mm);
@@ -179,7 +188,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
- if (pte_none(*old_pte))
+ if (pte_none(ptep_get(old_pte)))
continue;
pte = ptep_get_and_clear(mm, old_addr, old_pte);
@@ -208,8 +217,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
+out:
if (need_rmap_locks)
drop_rmap_locks(vma);
+ return err;
}
#ifndef arch_supports_page_table_move
@@ -537,6 +548,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
+again:
if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
pmd_devmap(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE &&
@@ -544,8 +556,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_pmd, new_pmd, need_rmap_locks))
continue;
split_huge_pmd(vma, old_pmd, old_addr);
- if (pmd_trans_unstable(old_pmd))
- continue;
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
extent == PMD_SIZE) {
/*
@@ -556,11 +566,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_pmd, new_pmd, true))
continue;
}
-
+ if (pmd_none(*old_pmd))
+ continue;
if (pte_alloc(new_vma->vm_mm, new_pmd))
break;
- move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
- new_pmd, new_addr, need_rmap_locks);
+ if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
+ new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
+ goto again;
}
mmu_notifier_invalidate_range_end(&range);
@@ -775,7 +787,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return ERR_PTR(-EFAULT);
- if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
+ if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
return ERR_PTR(-EAGAIN);
if (!may_expand_vm(mm, vma->vm_flags,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 044e1eed720e..612b5597d3af 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1130,12 +1130,10 @@ bool out_of_memory(struct oom_control *oc)
/*
* The OOM killer does not compensate for IO-less reclaim.
- * pagefault_out_of_memory lost its gfp context so we have to
- * make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
- * invoke the OOM killer even if it is a GFP_NOFS allocation.
+ * But mem_cgroup_oom() has to invoke the OOM killer even
+ * if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
+ if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index db7943999007..1d17fb1ec863 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2597,7 +2597,7 @@ EXPORT_SYMBOL(noop_dirty_folio);
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold lock_page_memcg().
+ * Caller must hold folio_memcg_lock().
*
* NOTE: This relies on being atomic wrt interrupts.
*/
@@ -2631,7 +2631,7 @@ static void folio_account_dirtied(struct folio *folio,
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold lock_page_memcg().
+ * Caller must hold folio_memcg_lock().
*/
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{
@@ -2650,7 +2650,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
* If warn is true, then emit a warning if the folio is not uptodate and has
* not been truncated.
*
- * The caller must hold lock_page_memcg(). Most callers have the folio
+ * The caller must hold folio_memcg_lock(). Most callers have the folio
* locked. A few have the folio blocked from truncation through other
* means (eg zap_vma_pages() has it mapped and is holding the page table
* lock). This can also be called from mark_buffer_dirty(), which I
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d239fba3f31c..7d3460c7a480 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,21 +18,14 @@
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
#include <linux/interrupt.h>
-#include <linux/pagemap.h>
#include <linux/jiffies.h>
-#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/kmsan.h>
#include <linux/module.h>
#include <linux/suspend.h>
-#include <linux/pagevec.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
@@ -41,19 +34,8 @@
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
-#include <linux/vmalloc.h>
#include <linux/vmstat.h>
-#include <linux/mempolicy.h>
-#include <linux/memremap.h>
-#include <linux/stop_machine.h>
-#include <linux/random.h>
-#include <linux/sort.h>
-#include <linux/pfn.h>
-#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
-#include <linux/page-isolation.h>
-#include <linux/debugobjects.h>
-#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
@@ -61,26 +43,19 @@
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
-#include <linux/hugetlb.h>
-#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_table_check.h>
-#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
-#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
-#include <asm/sections.h>
-#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"
-#include "swap.h"
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
@@ -227,18 +202,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
};
EXPORT_SYMBOL(node_states);
-atomic_long_t _totalram_pages __read_mostly;
-EXPORT_SYMBOL(_totalram_pages);
-unsigned long totalreserve_pages __read_mostly;
-unsigned long totalcma_pages __read_mostly;
-
-int percpu_pagelist_high_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
-EXPORT_SYMBOL(init_on_alloc);
-
-DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
-EXPORT_SYMBOL(init_on_free);
/*
* A cached value of the page's pageblock's migratetype, used when the page is
@@ -258,44 +222,6 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype)
page->index = migratetype;
}
-#ifdef CONFIG_PM_SLEEP
-/*
- * The following functions are used by the suspend/hibernate code to temporarily
- * change gfp_allowed_mask in order to avoid using I/O during memory allocations
- * while devices are suspended. To avoid races with the suspend/hibernate code,
- * they should always be called with system_transition_mutex held
- * (gfp_allowed_mask also should only be modified with system_transition_mutex
- * held, unless the suspend/hibernate code is guaranteed not to run in parallel
- * with that modification).
- */
-
-static gfp_t saved_gfp_mask;
-
-void pm_restore_gfp_mask(void)
-{
- WARN_ON(!mutex_is_locked(&system_transition_mutex));
- if (saved_gfp_mask) {
- gfp_allowed_mask = saved_gfp_mask;
- saved_gfp_mask = 0;
- }
-}
-
-void pm_restrict_gfp_mask(void)
-{
- WARN_ON(!mutex_is_locked(&system_transition_mutex));
- WARN_ON(saved_gfp_mask);
- saved_gfp_mask = gfp_allowed_mask;
- gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
-}
-
-bool pm_suspended_storage(void)
-{
- if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
- return false;
- return true;
-}
-#endif /* CONFIG_PM_SLEEP */
-
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif
@@ -314,7 +240,7 @@ static void __free_pages_ok(struct page *page, unsigned int order,
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
*/
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
+static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
[ZONE_DMA] = 256,
#endif
@@ -358,7 +284,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
#endif
};
-compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
+static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
[NULL_COMPOUND_DTOR] = NULL,
[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
@@ -371,10 +297,8 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
-int watermark_boost_factor __read_mostly = 15000;
-int watermark_scale_factor = 10;
-
-bool mirrored_kernelcore __initdata_memblock;
+static int watermark_boost_factor __read_mostly = 15000;
+static int watermark_scale_factor = 10;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -556,13 +480,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
return ret;
}
-static int page_is_consistent(struct zone *zone, struct page *page)
-{
- if (zone != page_zone(page))
- return 0;
-
- return 1;
-}
/*
* Temporary debugging check for pages not lying within a given zone.
*/
@@ -570,7 +487,7 @@ static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return 1;
- if (!page_is_consistent(zone, page))
+ if (zone != page_zone(page))
return 1;
return 0;
@@ -710,75 +627,6 @@ void destroy_large_folio(struct folio *folio)
compound_page_dtors[dtor](&folio->page);
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
-unsigned int _debug_guardpage_minorder;
-
-bool _debug_pagealloc_enabled_early __read_mostly
- = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
-EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
-DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
-EXPORT_SYMBOL(_debug_pagealloc_enabled);
-
-DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
-
-static int __init early_debug_pagealloc(char *buf)
-{
- return kstrtobool(buf, &_debug_pagealloc_enabled_early);
-}
-early_param("debug_pagealloc", early_debug_pagealloc);
-
-static int __init debug_guardpage_minorder_setup(char *buf)
-{
- unsigned long res;
-
- if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
- pr_err("Bad debug_guardpage_minorder value\n");
- return 0;
- }
- _debug_guardpage_minorder = res;
- pr_info("Setting debug_guardpage_minorder to %lu\n", res);
- return 0;
-}
-early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
-
-static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
-{
- if (!debug_guardpage_enabled())
- return false;
-
- if (order >= debug_guardpage_minorder())
- return false;
-
- __SetPageGuard(page);
- INIT_LIST_HEAD(&page->buddy_list);
- set_page_private(page, order);
- /* Guard pages are not available for any usage */
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, -(1 << order), migratetype);
-
- return true;
-}
-
-static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
-{
- if (!debug_guardpage_enabled())
- return;
-
- __ClearPageGuard(page);
-
- set_page_private(page, 0);
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, (1 << order), migratetype);
-}
-#else
-static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) { return false; }
-static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
-#endif
-
static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
@@ -885,7 +733,7 @@ static inline struct page *get_page_from_free_area(struct free_area *area,
int migratetype)
{
return list_first_entry_or_null(&area->free_list[migratetype],
- struct page, lru);
+ struct page, buddy_list);
}
/*
@@ -1137,6 +985,11 @@ static inline bool free_page_is_bad(struct page *page)
return true;
}
+static inline bool is_check_pages_enabled(void)
+{
+ return static_branch_unlikely(&check_pages_enabled);
+}
+
static int free_tail_page_prepare(struct page *head_page, struct page *page)
{
struct folio *folio = (struct folio *)head_page;
@@ -1148,7 +1001,7 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
*/
BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
- if (!static_branch_unlikely(&check_pages_enabled)) {
+ if (!is_check_pages_enabled()) {
ret = 0;
goto out;
}
@@ -1534,7 +1387,7 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
/* end_pfn is one past the range we are checking */
end_pfn--;
- if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ if (!pfn_valid(end_pfn))
return NULL;
start_page = pfn_to_online_page(start_pfn);
@@ -1553,33 +1406,6 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
return start_page;
}
-void set_zone_contiguous(struct zone *zone)
-{
- unsigned long block_start_pfn = zone->zone_start_pfn;
- unsigned long block_end_pfn;
-
- block_end_pfn = pageblock_end_pfn(block_start_pfn);
- for (; block_start_pfn < zone_end_pfn(zone);
- block_start_pfn = block_end_pfn,
- block_end_pfn += pageblock_nr_pages) {
-
- block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
-
- if (!__pageblock_pfn_to_page(block_start_pfn,
- block_end_pfn, zone))
- return;
- cond_resched();
- }
-
- /* We confirm that there is no hole */
- zone->contiguous = true;
-}
-
-void clear_zone_contiguous(struct zone *zone)
-{
- zone->contiguous = false;
-}
-
/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
@@ -2514,61 +2340,6 @@ void drain_all_pages(struct zone *zone)
__drain_all_pages(zone, false);
}
-#ifdef CONFIG_HIBERNATION
-
-/*
- * Touch the watchdog for every WD_PAGE_COUNT pages.
- */
-#define WD_PAGE_COUNT (128*1024)
-
-void mark_free_pages(struct zone *zone)
-{
- unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
- unsigned long flags;
- unsigned int order, t;
- struct page *page;
-
- if (zone_is_empty(zone))
- return;
-
- spin_lock_irqsave(&zone->lock, flags);
-
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
-
- if (!--page_count) {
- touch_nmi_watchdog();
- page_count = WD_PAGE_COUNT;
- }
-
- if (page_zone(page) != zone)
- continue;
-
- if (!swsusp_page_is_forbidden(page))
- swsusp_unset_page_free(page);
- }
-
- for_each_migratetype_order(order, t) {
- list_for_each_entry(page,
- &zone->free_area[order].free_list[t], buddy_list) {
- unsigned long i;
-
- pfn = page_to_pfn(page);
- for (i = 0; i < (1UL << order); i++) {
- if (!--page_count) {
- touch_nmi_watchdog();
- page_count = WD_PAGE_COUNT;
- }
- swsusp_set_page_free(pfn_to_page(pfn + i));
- }
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
-}
-#endif /* CONFIG_PM */
-
static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
unsigned int order)
{
@@ -3065,7 +2836,8 @@ struct page *rmqueue(struct zone *preferred_zone,
out:
/* Separate test+clear to avoid unnecessary atomics */
- if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
+ if ((alloc_flags & ALLOC_KSWAPD) &&
+ unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
@@ -3074,80 +2846,6 @@ out:
return page;
}
-#ifdef CONFIG_FAIL_PAGE_ALLOC
-
-static struct {
- struct fault_attr attr;
-
- bool ignore_gfp_highmem;
- bool ignore_gfp_reclaim;
- u32 min_order;
-} fail_page_alloc = {
- .attr = FAULT_ATTR_INITIALIZER,
- .ignore_gfp_reclaim = true,
- .ignore_gfp_highmem = true,
- .min_order = 1,
-};
-
-static int __init setup_fail_page_alloc(char *str)
-{
- return setup_fault_attr(&fail_page_alloc.attr, str);
-}
-__setup("fail_page_alloc=", setup_fail_page_alloc);
-
-static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
-{
- int flags = 0;
-
- if (order < fail_page_alloc.min_order)
- return false;
- if (gfp_mask & __GFP_NOFAIL)
- return false;
- if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
- return false;
- if (fail_page_alloc.ignore_gfp_reclaim &&
- (gfp_mask & __GFP_DIRECT_RECLAIM))
- return false;
-
- /* See comment in __should_failslab() */
- if (gfp_mask & __GFP_NOWARN)
- flags |= FAULT_NOWARN;
-
- return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
-}
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-static int __init fail_page_alloc_debugfs(void)
-{
- umode_t mode = S_IFREG | 0600;
- struct dentry *dir;
-
- dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
- &fail_page_alloc.attr);
-
- debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_reclaim);
- debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem);
- debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
-
- return 0;
-}
-
-late_initcall(fail_page_alloc_debugfs);
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
-#else /* CONFIG_FAIL_PAGE_ALLOC */
-
-static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
-{
- return false;
-}
-
-#endif /* CONFIG_FAIL_PAGE_ALLOC */
-
noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
@@ -3794,56 +3492,41 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
if (fatal_signal_pending(current))
return false;
- if (compaction_made_progress(compact_result))
- (*compaction_retries)++;
-
- /*
- * compaction considers all the zone as desperately out of memory
- * so it doesn't really make much sense to retry except when the
- * failure could be caused by insufficient priority
- */
- if (compaction_failed(compact_result))
- goto check_priority;
-
/*
- * compaction was skipped because there are not enough order-0 pages
- * to work with, so we retry only if it looks like reclaim can help.
+ * Compaction was skipped due to a lack of free order-0
+ * migration targets. Continue if reclaim can help.
*/
- if (compaction_needs_reclaim(compact_result)) {
+ if (compact_result == COMPACT_SKIPPED) {
ret = compaction_zonelist_suitable(ac, order, alloc_flags);
goto out;
}
/*
- * make sure the compaction wasn't deferred or didn't bail out early
- * due to locks contention before we declare that we should give up.
- * But the next retry should use a higher priority if allowed, so
- * we don't just keep bailing out endlessly.
+ * Compaction managed to coalesce some page blocks, but the
+ * allocation failed presumably due to a race. Retry some.
*/
- if (compaction_withdrawn(compact_result)) {
- goto check_priority;
- }
+ if (compact_result == COMPACT_SUCCESS) {
+ /*
+ * !costly requests are much more important than
+ * __GFP_RETRY_MAYFAIL costly ones because they are de
+ * facto nofail and invoke OOM killer to move on while
+ * costly can fail and users are ready to cope with
+ * that. 1/4 retries is rather arbitrary but we would
+ * need much more detailed feedback from compaction to
+ * make a better decision.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ max_retries /= 4;
- /*
- * !costly requests are much more important than __GFP_RETRY_MAYFAIL
- * costly ones because they are de facto nofail and invoke OOM
- * killer to move on while costly can fail and users are ready
- * to cope with that. 1/4 retries is rather arbitrary but we
- * would need much more detailed feedback from compaction to
- * make a better decision.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- max_retries /= 4;
- if (*compaction_retries <= max_retries) {
- ret = true;
- goto out;
+ if (++(*compaction_retries) <= max_retries) {
+ ret = true;
+ goto out;
+ }
}
/*
- * Make sure there are attempts at the highest priority if we exhausted
- * all retries or failed at the lower priorities.
+ * Compaction failed. Retry with increasing priority.
*/
-check_priority:
min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
@@ -5163,383 +4846,6 @@ unsigned long nr_free_buffer_pages(void)
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
-static inline void show_node(struct zone *zone)
-{
- if (IS_ENABLED(CONFIG_NUMA))
- printk("Node %d ", zone_to_nid(zone));
-}
-
-long si_mem_available(void)
-{
- long available;
- unsigned long pagecache;
- unsigned long wmark_low = 0;
- unsigned long pages[NR_LRU_LISTS];
- unsigned long reclaimable;
- struct zone *zone;
- int lru;
-
- for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
-
- for_each_zone(zone)
- wmark_low += low_wmark_pages(zone);
-
- /*
- * Estimate the amount of memory available for userspace allocations,
- * without causing swapping or OOM.
- */
- available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
-
- /*
- * Not all the page cache can be freed, otherwise the system will
- * start swapping or thrashing. Assume at least half of the page
- * cache, or the low watermark worth of cache, needs to stay.
- */
- pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
- pagecache -= min(pagecache / 2, wmark_low);
- available += pagecache;
-
- /*
- * Part of the reclaimable slab and other kernel memory consists of
- * items that are in use, and cannot be freed. Cap this estimate at the
- * low watermark.
- */
- reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
- available += reclaimable - min(reclaimable / 2, wmark_low);
-
- if (available < 0)
- available = 0;
- return available;
-}
-EXPORT_SYMBOL_GPL(si_mem_available);
-
-void si_meminfo(struct sysinfo *val)
-{
- val->totalram = totalram_pages();
- val->sharedram = global_node_page_state(NR_SHMEM);
- val->freeram = global_zone_page_state(NR_FREE_PAGES);
- val->bufferram = nr_blockdev_pages();
- val->totalhigh = totalhigh_pages();
- val->freehigh = nr_free_highpages();
- val->mem_unit = PAGE_SIZE;
-}
-
-EXPORT_SYMBOL(si_meminfo);
-
-#ifdef CONFIG_NUMA
-void si_meminfo_node(struct sysinfo *val, int nid)
-{
- int zone_type; /* needs to be signed */
- unsigned long managed_pages = 0;
- unsigned long managed_highpages = 0;
- unsigned long free_highpages = 0;
- pg_data_t *pgdat = NODE_DATA(nid);
-
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
- val->totalram = managed_pages;
- val->sharedram = node_page_state(pgdat, NR_SHMEM);
- val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
-#ifdef CONFIG_HIGHMEM
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
- struct zone *zone = &pgdat->node_zones[zone_type];
-
- if (is_highmem(zone)) {
- managed_highpages += zone_managed_pages(zone);
- free_highpages += zone_page_state(zone, NR_FREE_PAGES);
- }
- }
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#else
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#endif
- val->mem_unit = PAGE_SIZE;
-}
-#endif
-
-/*
- * Determine whether the node should be displayed or not, depending on whether
- * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
- */
-static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
-{
- if (!(flags & SHOW_MEM_FILTER_NODES))
- return false;
-
- /*
- * no node mask - aka implicit memory numa policy. Do not bother with
- * the synchronization - read_mems_allowed_begin - because we do not
- * have to be precise here.
- */
- if (!nodemask)
- nodemask = &cpuset_current_mems_allowed;
-
- return !node_isset(nid, *nodemask);
-}
-
-static void show_migration_types(unsigned char type)
-{
- static const char types[MIGRATE_TYPES] = {
- [MIGRATE_UNMOVABLE] = 'U',
- [MIGRATE_MOVABLE] = 'M',
- [MIGRATE_RECLAIMABLE] = 'E',
- [MIGRATE_HIGHATOMIC] = 'H',
-#ifdef CONFIG_CMA
- [MIGRATE_CMA] = 'C',
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- [MIGRATE_ISOLATE] = 'I',
-#endif
- };
- char tmp[MIGRATE_TYPES + 1];
- char *p = tmp;
- int i;
-
- for (i = 0; i < MIGRATE_TYPES; i++) {
- if (type & (1 << i))
- *p++ = types[i];
- }
-
- *p = '\0';
- printk(KERN_CONT "(%s) ", tmp);
-}
-
-static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
-{
- int zone_idx;
- for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
- if (zone_managed_pages(pgdat->node_zones + zone_idx))
- return true;
- return false;
-}
-
-/*
- * Show free area list (used inside shift_scroll-lock stuff)
- * We also calculate the percentage fragmentation. We do this by counting the
- * memory on each free list with the exception of the first item on the list.
- *
- * Bits in @filter:
- * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
- * cpuset.
- */
-void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
-{
- unsigned long free_pcp = 0;
- int cpu, nid;
- struct zone *zone;
- pg_data_t *pgdat;
-
- for_each_populated_zone(zone) {
- if (zone_idx(zone) > max_zone_idx)
- continue;
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
-
- for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
- }
-
- printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
- " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu dirty:%lu writeback:%lu\n"
- " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu\n"
- " sec_pagetables:%lu bounce:%lu\n"
- " kernel_misc_reclaimable:%lu\n"
- " free:%lu free_pcp:%lu free_cma:%lu\n",
- global_node_page_state(NR_ACTIVE_ANON),
- global_node_page_state(NR_INACTIVE_ANON),
- global_node_page_state(NR_ISOLATED_ANON),
- global_node_page_state(NR_ACTIVE_FILE),
- global_node_page_state(NR_INACTIVE_FILE),
- global_node_page_state(NR_ISOLATED_FILE),
- global_node_page_state(NR_UNEVICTABLE),
- global_node_page_state(NR_FILE_DIRTY),
- global_node_page_state(NR_WRITEBACK),
- global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
- global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
- global_node_page_state(NR_FILE_MAPPED),
- global_node_page_state(NR_SHMEM),
- global_node_page_state(NR_PAGETABLE),
- global_node_page_state(NR_SECONDARY_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
- global_zone_page_state(NR_FREE_PAGES),
- free_pcp,
- global_zone_page_state(NR_FREE_CMA_PAGES));
-
- for_each_online_pgdat(pgdat) {
- if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
- continue;
- if (!node_has_managed_zones(pgdat, max_zone_idx))
- continue;
-
- printk("Node %d"
- " active_anon:%lukB"
- " inactive_anon:%lukB"
- " active_file:%lukB"
- " inactive_file:%lukB"
- " unevictable:%lukB"
- " isolated(anon):%lukB"
- " isolated(file):%lukB"
- " mapped:%lukB"
- " dirty:%lukB"
- " writeback:%lukB"
- " shmem:%lukB"
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- " shmem_thp: %lukB"
- " shmem_pmdmapped: %lukB"
- " anon_thp: %lukB"
-#endif
- " writeback_tmp:%lukB"
- " kernel_stack:%lukB"
-#ifdef CONFIG_SHADOW_CALL_STACK
- " shadow_call_stack:%lukB"
-#endif
- " pagetables:%lukB"
- " sec_pagetables:%lukB"
- " all_unreclaimable? %s"
- "\n",
- pgdat->node_id,
- K(node_page_state(pgdat, NR_ACTIVE_ANON)),
- K(node_page_state(pgdat, NR_INACTIVE_ANON)),
- K(node_page_state(pgdat, NR_ACTIVE_FILE)),
- K(node_page_state(pgdat, NR_INACTIVE_FILE)),
- K(node_page_state(pgdat, NR_UNEVICTABLE)),
- K(node_page_state(pgdat, NR_ISOLATED_ANON)),
- K(node_page_state(pgdat, NR_ISOLATED_FILE)),
- K(node_page_state(pgdat, NR_FILE_MAPPED)),
- K(node_page_state(pgdat, NR_FILE_DIRTY)),
- K(node_page_state(pgdat, NR_WRITEBACK)),
- K(node_page_state(pgdat, NR_SHMEM)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- K(node_page_state(pgdat, NR_SHMEM_THPS)),
- K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
- K(node_page_state(pgdat, NR_ANON_THPS)),
-#endif
- K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
- node_page_state(pgdat, NR_KERNEL_STACK_KB),
-#ifdef CONFIG_SHADOW_CALL_STACK
- node_page_state(pgdat, NR_KERNEL_SCS_KB),
-#endif
- K(node_page_state(pgdat, NR_PAGETABLE)),
- K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
- "yes" : "no");
- }
-
- for_each_populated_zone(zone) {
- int i;
-
- if (zone_idx(zone) > max_zone_idx)
- continue;
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
-
- free_pcp = 0;
- for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
-
- show_node(zone);
- printk(KERN_CONT
- "%s"
- " free:%lukB"
- " boost:%lukB"
- " min:%lukB"
- " low:%lukB"
- " high:%lukB"
- " reserved_highatomic:%luKB"
- " active_anon:%lukB"
- " inactive_anon:%lukB"
- " active_file:%lukB"
- " inactive_file:%lukB"
- " unevictable:%lukB"
- " writepending:%lukB"
- " present:%lukB"
- " managed:%lukB"
- " mlocked:%lukB"
- " bounce:%lukB"
- " free_pcp:%lukB"
- " local_pcp:%ukB"
- " free_cma:%lukB"
- "\n",
- zone->name,
- K(zone_page_state(zone, NR_FREE_PAGES)),
- K(zone->watermark_boost),
- K(min_wmark_pages(zone)),
- K(low_wmark_pages(zone)),
- K(high_wmark_pages(zone)),
- K(zone->nr_reserved_highatomic),
- K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
- K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
- K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
- K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
- K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
- K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
- K(zone->present_pages),
- K(zone_managed_pages(zone)),
- K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_BOUNCE)),
- K(free_pcp),
- K(this_cpu_read(zone->per_cpu_pageset->count)),
- K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
- printk("lowmem_reserve[]:");
- for (i = 0; i < MAX_NR_ZONES; i++)
- printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
- printk(KERN_CONT "\n");
- }
-
- for_each_populated_zone(zone) {
- unsigned int order;
- unsigned long nr[MAX_ORDER + 1], flags, total = 0;
- unsigned char types[MAX_ORDER + 1];
-
- if (zone_idx(zone) > max_zone_idx)
- continue;
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
- show_node(zone);
- printk(KERN_CONT "%s: ", zone->name);
-
- spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order <= MAX_ORDER; order++) {
- struct free_area *area = &zone->free_area[order];
- int type;
-
- nr[order] = area->nr_free;
- total += nr[order] << order;
-
- types[order] = 0;
- for (type = 0; type < MIGRATE_TYPES; type++) {
- if (!free_area_empty(area, type))
- types[order] |= 1 << type;
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- for (order = 0; order <= MAX_ORDER; order++) {
- printk(KERN_CONT "%lu*%lukB ",
- nr[order], K(1UL) << order);
- if (nr[order])
- show_migration_types(types[order]);
- }
- printk(KERN_CONT "= %lukB\n", K(total));
- }
-
- for_each_online_node(nid) {
- if (show_mem_node_skip(filter, nid, nodemask))
- continue;
- hugetlb_show_meminfo_node(nid);
- }
-
- printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
-
- show_swap_cache_info();
-}
-
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
@@ -5586,12 +4892,12 @@ static int __parse_numa_zonelist_order(char *s)
return 0;
}
-char numa_zonelist_order[] = "Node";
-
+static char numa_zonelist_order[] = "Node";
+#define NUMA_ZONELIST_ORDER_LEN 16
/*
* sysctl handler for numa_zonelist_order
*/
-int numa_zonelist_order_handler(struct ctl_table *table, int write,
+static int numa_zonelist_order_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
if (write)
@@ -5599,7 +4905,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
return proc_dostring(table, write, buffer, length, ppos);
}
-
static int node_load[MAX_NUMNODES];
/**
@@ -6002,6 +5307,7 @@ static int zone_batchsize(struct zone *zone)
#endif
}
+static int percpu_pagelist_high_fraction;
static int zone_highsize(struct zone *zone, int batch, int cpu_online)
{
#ifdef CONFIG_MMU
@@ -6531,7 +5837,7 @@ postcore_initcall(init_per_zone_wmark_min)
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
-int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
+static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6547,7 +5853,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6577,7 +5883,7 @@ static void setup_min_unmapped_ratio(void)
}
-int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
+static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6604,7 +5910,7 @@ static void setup_min_slab_ratio(void)
sysctl_min_slab_ratio) / 100;
}
-int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6628,8 +5934,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
* minimum watermarks. The lowmem reserve ratio can only make sense
* if in function of the boot time zone sizes.
*/
-int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
+static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table,
+ int write, void *buffer, size_t *length, loff_t *ppos)
{
int i;
@@ -6649,7 +5955,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
* cpu. It is the fraction of total pages in each zone that a hot per cpu
* pagelist can have before it gets flushed back to buddy allocator.
*/
-int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
+static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
int write, void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
@@ -6682,9 +5988,83 @@ out:
return ret;
}
+static struct ctl_table page_alloc_sysctl_table[] = {
+ {
+ .procname = "min_free_kbytes",
+ .data = &min_free_kbytes,
+ .maxlen = sizeof(min_free_kbytes),
+ .mode = 0644,
+ .proc_handler = min_free_kbytes_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "watermark_boost_factor",
+ .data = &watermark_boost_factor,
+ .maxlen = sizeof(watermark_boost_factor),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "watermark_scale_factor",
+ .data = &watermark_scale_factor,
+ .maxlen = sizeof(watermark_scale_factor),
+ .mode = 0644,
+ .proc_handler = watermark_scale_factor_sysctl_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_THREE_THOUSAND,
+ },
+ {
+ .procname = "percpu_pagelist_high_fraction",
+ .data = &percpu_pagelist_high_fraction,
+ .maxlen = sizeof(percpu_pagelist_high_fraction),
+ .mode = 0644,
+ .proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "lowmem_reserve_ratio",
+ .data = &sysctl_lowmem_reserve_ratio,
+ .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
+ .mode = 0644,
+ .proc_handler = lowmem_reserve_ratio_sysctl_handler,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "numa_zonelist_order",
+ .data = &numa_zonelist_order,
+ .maxlen = NUMA_ZONELIST_ORDER_LEN,
+ .mode = 0644,
+ .proc_handler = numa_zonelist_order_handler,
+ },
+ {
+ .procname = "min_unmapped_ratio",
+ .data = &sysctl_min_unmapped_ratio,
+ .maxlen = sizeof(sysctl_min_unmapped_ratio),
+ .mode = 0644,
+ .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+ {
+ .procname = "min_slab_ratio",
+ .data = &sysctl_min_slab_ratio,
+ .maxlen = sizeof(sysctl_min_slab_ratio),
+ .mode = 0644,
+ .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+#endif
+ {}
+};
+
+void __init page_alloc_sysctl_init(void)
+{
+ register_sysctl_init("vm", page_alloc_sysctl_table);
+}
+
#ifdef CONFIG_CONTIG_ALLOC
-#if defined(CONFIG_DYNAMIC_DEBUG) || \
- (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
{
@@ -6698,11 +6078,6 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
dump_page(page, "migration failure");
}
}
-#else
-static inline void alloc_contig_dump_pages(struct list_head *page_list)
-{
-}
-#endif
/* [start, end) must belong to a single zone. */
int __alloc_contig_migrate_range(struct compact_control *cc,
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c6f3605e37ab..6599cc965e21 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -481,10 +481,9 @@ failed:
}
/**
- * start_isolate_page_range() - make page-allocation-type of range of pages to
- * be MIGRATE_ISOLATE.
- * @start_pfn: The lower PFN of the range to be isolated.
- * @end_pfn: The upper PFN of the range to be isolated.
+ * start_isolate_page_range() - mark page range MIGRATE_ISOLATE
+ * @start_pfn: The first PFN of the range to be isolated.
+ * @end_pfn: The last PFN of the range to be isolated.
* @migratetype: Migrate type to set in error recovery.
* @flags: The following flags are allowed (they can be combined in
* a bit mask)
@@ -571,8 +570,14 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
return 0;
}
-/*
- * Make isolated pages available again.
+/**
+ * undo_isolate_page_range - undo effects of start_isolate_page_range()
+ * @start_pfn: The first PFN of the isolated range
+ * @end_pfn: The last PFN of the isolated range
+ * @migratetype: New migrate type to set on the range
+ *
+ * This finds every MIGRATE_ISOLATE page block in the given range
+ * and switches it to @migratetype.
*/
void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
int migratetype)
@@ -631,7 +636,21 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
return pfn;
}
-/* Caller should ensure that requested range is in a single zone */
+/**
+ * test_pages_isolated - check if pageblocks in range are isolated
+ * @start_pfn: The first PFN of the isolated range
+ * @end_pfn: The first PFN *after* the isolated range
+ * @isol_flags: Testing mode flags
+ *
+ * This tests if all in the specified range are free.
+ *
+ * If %MEMORY_OFFLINE is specified in @flags, it will consider
+ * poisoned and offlined pages free as well.
+ *
+ * Caller must ensure the requested range doesn't span zones.
+ *
+ * Returns 0 if true, -EBUSY if one or more pages are in use.
+ */
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int isol_flags)
{
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 31169b3e7f06..c93baef0148f 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -418,7 +418,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfp_migratetype(page_owner->gfp_mask);
ret += scnprintf(kbuf + ret, count - ret,
- "PFN %lu type %s Block %lu type %s Flags %pGp\n",
+ "PFN 0x%lx type %s Block %lu type %s Flags %pGp\n",
pfn,
migratetype_names[page_mt],
pfn >> pageblock_order,
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index f2baf97d5f38..93ec7690a0d8 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -196,7 +196,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
if (&init_mm == mm)
return;
- __page_table_check_pte_clear(mm, addr, *ptep);
+ __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
if (pte_user_accessible_page(pte)) {
page_table_check_set(mm, addr, pte_pfn(pte),
PAGE_SIZE >> PAGE_SHIFT,
@@ -246,8 +246,10 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
pte_t *ptep = pte_offset_map(&pmd, addr);
unsigned long i;
+ if (WARN_ON(!ptep))
+ return;
for (i = 0; i < PTRS_PER_PTE; i++) {
- __page_table_check_pte_clear(mm, addr, *ptep);
+ __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
addr += PAGE_SIZE;
ptep++;
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 4e448cfbc6ef..49e0d28f0379 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -13,42 +13,61 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
return false;
}
-static bool map_pte(struct page_vma_mapped_walk *pvmw)
+static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
{
- pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address);
- if (!(pvmw->flags & PVMW_SYNC)) {
- if (pvmw->flags & PVMW_MIGRATION) {
- if (!is_swap_pte(*pvmw->pte))
- return false;
- } else {
- /*
- * We get here when we are trying to unmap a private
- * device page from the process address space. Such
- * page is not CPU accessible and thus is mapped as
- * a special swap entry, nonetheless it still does
- * count as a valid regular mapping for the page (and
- * is accounted as such in page maps count).
- *
- * So handle this special case as if it was a normal
- * page mapping ie lock CPU page table and returns
- * true.
- *
- * For more details on device private memory see HMM
- * (include/linux/hmm.h or mm/hmm.c).
- */
- if (is_swap_pte(*pvmw->pte)) {
- swp_entry_t entry;
+ pte_t ptent;
- /* Handle un-addressable ZONE_DEVICE memory */
- entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_device_private_entry(entry) &&
- !is_device_exclusive_entry(entry))
- return false;
- } else if (!pte_present(*pvmw->pte))
- return false;
- }
+ if (pvmw->flags & PVMW_SYNC) {
+ /* Use the stricter lookup */
+ pvmw->pte = pte_offset_map_lock(pvmw->vma->vm_mm, pvmw->pmd,
+ pvmw->address, &pvmw->ptl);
+ *ptlp = pvmw->ptl;
+ return !!pvmw->pte;
}
- pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+
+ /*
+ * It is important to return the ptl corresponding to pte,
+ * in case *pvmw->pmd changes underneath us; so we need to
+ * return it even when choosing not to lock, in case caller
+ * proceeds to loop over next ptes, and finds a match later.
+ * Though, in most cases, page lock already protects this.
+ */
+ pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
+ pvmw->address, ptlp);
+ if (!pvmw->pte)
+ return false;
+
+ ptent = ptep_get(pvmw->pte);
+
+ if (pvmw->flags & PVMW_MIGRATION) {
+ if (!is_swap_pte(ptent))
+ return false;
+ } else if (is_swap_pte(ptent)) {
+ swp_entry_t entry;
+ /*
+ * Handle un-addressable ZONE_DEVICE memory.
+ *
+ * We get here when we are trying to unmap a private
+ * device page from the process address space. Such
+ * page is not CPU accessible and thus is mapped as
+ * a special swap entry, nonetheless it still does
+ * count as a valid regular mapping for the page
+ * (and is accounted as such in page maps count).
+ *
+ * So handle this special case as if it was a normal
+ * page mapping ie lock CPU page table and return true.
+ *
+ * For more details on device private memory see HMM
+ * (include/linux/hmm.h or mm/hmm.c).
+ */
+ entry = pte_to_swp_entry(ptent);
+ if (!is_device_private_entry(entry) &&
+ !is_device_exclusive_entry(entry))
+ return false;
+ } else if (!pte_present(ptent)) {
+ return false;
+ }
+ pvmw->ptl = *ptlp;
spin_lock(pvmw->ptl);
return true;
}
@@ -75,33 +94,34 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
static bool check_pte(struct page_vma_mapped_walk *pvmw)
{
unsigned long pfn;
+ pte_t ptent = ptep_get(pvmw->pte);
if (pvmw->flags & PVMW_MIGRATION) {
swp_entry_t entry;
- if (!is_swap_pte(*pvmw->pte))
+ if (!is_swap_pte(ptent))
return false;
- entry = pte_to_swp_entry(*pvmw->pte);
+ entry = pte_to_swp_entry(ptent);
if (!is_migration_entry(entry) &&
!is_device_exclusive_entry(entry))
return false;
pfn = swp_offset_pfn(entry);
- } else if (is_swap_pte(*pvmw->pte)) {
+ } else if (is_swap_pte(ptent)) {
swp_entry_t entry;
/* Handle un-addressable ZONE_DEVICE memory */
- entry = pte_to_swp_entry(*pvmw->pte);
+ entry = pte_to_swp_entry(ptent);
if (!is_device_private_entry(entry) &&
!is_device_exclusive_entry(entry))
return false;
pfn = swp_offset_pfn(entry);
} else {
- if (!pte_present(*pvmw->pte))
+ if (!pte_present(ptent))
return false;
- pfn = pte_pfn(*pvmw->pte);
+ pfn = pte_pfn(ptent);
}
return (pfn - pvmw->pfn) < pvmw->nr_pages;
@@ -153,6 +173,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
struct vm_area_struct *vma = pvmw->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long end;
+ spinlock_t *ptl;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -210,7 +231,7 @@ restart:
* compiler and used as a stale value after we've observed a
* subsequent update.
*/
- pmde = READ_ONCE(*pvmw->pmd);
+ pmde = pmdp_get_lockless(pvmw->pmd);
if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
(pmd_present(pmde) && pmd_devmap(pmde))) {
@@ -254,8 +275,11 @@ restart:
step_forward(pvmw, PMD_SIZE);
continue;
}
- if (!map_pte(pvmw))
+ if (!map_pte(pvmw, &ptl)) {
+ if (!pvmw->pte)
+ goto restart;
goto next_pte;
+ }
this_pte:
if (check_pte(pvmw))
return true;
@@ -275,14 +299,10 @@ next_pte:
goto restart;
}
pvmw->pte++;
- if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) {
- pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
- spin_lock(pvmw->ptl);
- }
- } while (pte_none(*pvmw->pte));
+ } while (pte_none(ptep_get(pvmw->pte)));
if (!pvmw->ptl) {
- pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+ pvmw->ptl = ptl;
spin_lock(pvmw->ptl);
}
goto this_pte;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index cb23f8a15c13..64437105fe0d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -46,15 +46,27 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
spinlock_t *ptl;
if (walk->no_vma) {
- pte = pte_offset_map(pmd, addr);
- err = walk_pte_range_inner(pte, addr, end, walk);
- pte_unmap(pte);
+ /*
+ * pte_offset_map() might apply user-specific validation.
+ */
+ if (walk->mm == &init_mm)
+ pte = pte_offset_kernel(pmd, addr);
+ else
+ pte = pte_offset_map(pmd, addr);
+ if (pte) {
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ if (walk->mm != &init_mm)
+ pte_unmap(pte);
+ }
} else {
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- err = walk_pte_range_inner(pte, addr, end, walk);
- pte_unmap_unlock(pte, ptl);
+ if (pte) {
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap_unlock(pte, ptl);
+ }
}
-
+ if (!pte)
+ walk->action = ACTION_AGAIN;
return err;
}
@@ -141,11 +153,8 @@ again:
!(ops->pte_entry))
continue;
- if (walk->vma) {
+ if (walk->vma)
split_huge_pmd(walk->vma, pmd, addr);
- if (pmd_trans_unstable(pmd))
- goto again;
- }
if (is_hugepd(__hugepd(pmd_val(*pmd))))
err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
@@ -153,6 +162,10 @@ again:
err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
+
+ if (walk->action == ACTION_AGAIN)
+ goto again;
+
} while (pmd++, addr = next, addr != end);
return err;
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index f9847c131998..cdd0aa597a81 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -41,10 +41,17 @@ struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
int free_bytes; /* free bytes in the chunk */
struct pcpu_block_md chunk_md;
- void *base_addr; /* base address of this chunk */
+ unsigned long *bound_map; /* boundary map */
+
+ /*
+ * base_addr is the base address of this chunk.
+ * To reduce false sharing, current layout is optimized to make sure
+ * base_addr locate in the different cacheline with free_bytes and
+ * chunk_md.
+ */
+ void *base_addr ____cacheline_aligned_in_smp;
unsigned long *alloc_map; /* allocation map */
- unsigned long *bound_map; /* boundary map */
struct pcpu_block_md *md_blocks; /* metadata blocks */
void *data; /* chunk data */
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index d2fc52bffafc..4d454953046f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -10,6 +10,8 @@
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <asm/tlb.h>
@@ -66,7 +68,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
- int changed = !pte_same(*ptep, entry);
+ int changed = !pte_same(ptep_get(ptep), entry);
if (changed) {
set_pte_at(vma->vm_mm, address, ptep, entry);
flush_tlb_fix_spurious_fault(vma, address, ptep);
@@ -229,3 +231,57 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
+{
+ pmd_t pmdval;
+
+ /* rcu_read_lock() to be added later */
+ pmdval = pmdp_get_lockless(pmd);
+ if (pmdvalp)
+ *pmdvalp = pmdval;
+ if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
+ goto nomap;
+ if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
+ goto nomap;
+ if (unlikely(pmd_bad(pmdval))) {
+ pmd_clear_bad(pmd);
+ goto nomap;
+ }
+ return __pte_map(&pmdval, addr);
+nomap:
+ /* rcu_read_unlock() to be added later */
+ return NULL;
+}
+
+pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp)
+{
+ pmd_t pmdval;
+ pte_t *pte;
+
+ pte = __pte_offset_map(pmd, addr, &pmdval);
+ if (likely(pte))
+ *ptlp = pte_lockptr(mm, &pmdval);
+ return pte;
+}
+
+pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp)
+{
+ spinlock_t *ptl;
+ pmd_t pmdval;
+ pte_t *pte;
+again:
+ pte = __pte_offset_map(pmd, addr, &pmdval);
+ if (unlikely(!pte))
+ return pte;
+ ptl = pte_lockptr(mm, &pmdval);
+ spin_lock(ptl);
+ if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+ *ptlp = ptl;
+ return pte;
+ }
+ pte_unmap_unlock(pte, ptl);
+ goto again;
+}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 78dfaf9e8990..0523edab03a6 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -104,7 +104,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
mmap_read_lock(mm);
pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
flags, process_pages,
- NULL, &locked);
+ &locked);
if (locked)
mmap_read_unlock(mm);
if (pinned_pages <= 0)
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 8adab455a68b..03c1bdae4a43 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -119,7 +119,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- pte_t val = ptep_get(pte);
+ pte_t val = ptep_get_lockless(pte);
if (st->effective_prot)
st->effective_prot(st, 4, pte_val(val));
diff --git a/mm/readahead.c b/mm/readahead.c
index 47afbca1d122..a9c999aa19af 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -120,7 +120,6 @@
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/psi.h>
#include <linux/syscalls.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index 19392e090bec..0c0d8857dfce 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -826,7 +826,8 @@ static bool folio_referenced_one(struct folio *folio,
}
if (pvmw.pte) {
- if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
+ if (lru_gen_enabled() &&
+ pte_young(ptep_get(pvmw.pte))) {
lru_gen_look_around(&pvmw);
referenced++;
}
@@ -956,13 +957,13 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
address = pvmw->address;
if (pvmw->pte) {
- pte_t entry;
pte_t *pte = pvmw->pte;
+ pte_t entry = ptep_get(pte);
- if (!pte_dirty(*pte) && !pte_write(*pte))
+ if (!pte_dirty(entry) && !pte_write(entry))
continue;
- flush_cache_page(vma, address, pte_pfn(*pte));
+ flush_cache_page(vma, address, pte_pfn(entry));
entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
@@ -1137,7 +1138,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
* @folio: Folio which contains page.
* @page: Page to add to rmap.
* @vma: VM area to add page to.
- * @address: User virtual address of the mapping
+ * @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process
*/
static void __page_set_anon_rmap(struct folio *folio, struct page *page,
@@ -1458,6 +1459,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long pfn;
/*
* When racing against e.g. zap_pte_range() on another cpu,
@@ -1508,8 +1510,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
- subpage = folio_page(folio,
- pte_pfn(*pvmw.pte) - folio_pfn(folio));
+ pfn = pte_pfn(ptep_get(pvmw.pte));
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
@@ -1571,7 +1573,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ flush_cache_page(vma, address, pfn);
/* Nuke the page table entry. */
if (should_defer_flush(mm, flags)) {
/*
@@ -1818,6 +1820,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long pfn;
/*
* When racing against e.g. zap_pte_range() on another cpu,
@@ -1877,6 +1880,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
+ pfn = pte_pfn(ptep_get(pvmw.pte));
+
if (folio_is_zone_device(folio)) {
/*
* Our PTE is a non-present device exclusive entry and
@@ -1891,8 +1896,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
subpage = &folio->page;
} else {
- subpage = folio_page(folio,
- pte_pfn(*pvmw.pte) - folio_pfn(folio));
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
}
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
@@ -1952,7 +1956,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ flush_cache_page(vma, address, pfn);
/* Nuke the page table entry. */
if (should_defer_flush(mm, flags)) {
/*
@@ -2187,6 +2191,7 @@ static bool page_make_device_exclusive_one(struct folio *folio,
struct mmu_notifier_range range;
swp_entry_t entry;
pte_t swp_pte;
+ pte_t ptent;
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, address, min(vma->vm_end,
@@ -2198,18 +2203,19 @@ static bool page_make_device_exclusive_one(struct folio *folio,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- if (!pte_present(*pvmw.pte)) {
+ ptent = ptep_get(pvmw.pte);
+ if (!pte_present(ptent)) {
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
subpage = folio_page(folio,
- pte_pfn(*pvmw.pte) - folio_pfn(folio));
+ pte_pfn(ptent) - folio_pfn(folio));
address = pvmw.address;
/* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ flush_cache_page(vma, address, pte_pfn(ptent));
pteval = ptep_clear_flush(vma, address, pvmw.pte);
/* Set the dirty flag on the folio now the pte is gone. */
@@ -2328,7 +2334,7 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
npages = get_user_pages_remote(mm, start, npages,
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
- pages, NULL, NULL);
+ pages, NULL);
if (npages < 0)
return npages;
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 0b502625cd30..86442a15d12f 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -35,7 +35,7 @@
#define SECRETMEM_MODE_MASK (0x0)
#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK
-static bool secretmem_enable __ro_after_init;
+static bool secretmem_enable __ro_after_init = 1;
module_param_named(enable, secretmem_enable, bool, 0400);
MODULE_PARM_DESC(secretmem_enable,
"Enable secretmem and memfd_secret(2) system call");
@@ -125,7 +125,7 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
- if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+ if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
return -EAGAIN;
vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1f504ed982cf..2f2e0e618072 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3858,6 +3858,7 @@ out:
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
+ struct mempolicy *mpol;
if (sbinfo->max_blocks != shmem_default_max_blocks())
seq_printf(seq, ",size=%luk",
@@ -3900,7 +3901,9 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (sbinfo->huge)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif
- shmem_show_mpol(seq, sbinfo->mpol);
+ mpol = shmem_get_sbmpol(sbinfo);
+ shmem_show_mpol(seq, mpol);
+ mpol_put(mpol);
if (sbinfo->noswap)
seq_printf(seq, ",noswap");
return 0;
@@ -4328,7 +4331,7 @@ static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
.init_fs_context = ramfs_init_fs_context,
.parameters = ramfs_fs_parameters,
- .kill_sb = kill_litter_super,
+ .kill_sb = ramfs_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
diff --git a/mm/show_mem.c b/mm/show_mem.c
new file mode 100644
index 000000000000..01f8e9905817
--- /dev/null
+++ b/mm/show_mem.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic show_mem() implementation
+ *
+ * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/cma.h>
+#include <linux/cpuset.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+
+#include "internal.h"
+#include "swap.h"
+
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
+unsigned long totalreserve_pages __read_mostly;
+unsigned long totalcma_pages __read_mostly;
+
+static inline void show_node(struct zone *zone)
+{
+ if (IS_ENABLED(CONFIG_NUMA))
+ printk("Node %d ", zone_to_nid(zone));
+}
+
+long si_mem_available(void)
+{
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
+ unsigned long pages[NR_LRU_LISTS];
+ unsigned long reclaimable;
+ struct zone *zone;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
+
+ for_each_zone(zone)
+ wmark_low += low_wmark_pages(zone);
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping or OOM.
+ */
+ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping or thrashing. Assume at least half of the page
+ * cache, or the low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab and other kernel memory consists of
+ * items that are in use, and cannot be freed. Cap this estimate at the
+ * low watermark.
+ */
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ available += reclaimable - min(reclaimable / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+ return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
+void si_meminfo(struct sysinfo *val)
+{
+ val->totalram = totalram_pages();
+ val->sharedram = global_node_page_state(NR_SHMEM);
+ val->freeram = global_zone_page_state(NR_FREE_PAGES);
+ val->bufferram = nr_blockdev_pages();
+ val->totalhigh = totalhigh_pages();
+ val->freehigh = nr_free_highpages();
+ val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+ int zone_type; /* needs to be signed */
+ unsigned long managed_pages = 0;
+ unsigned long managed_highpages = 0;
+ unsigned long free_highpages = 0;
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
+ val->totalram = managed_pages;
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
+#ifdef CONFIG_HIGHMEM
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+
+ if (is_highmem(zone)) {
+ managed_highpages += zone_managed_pages(zone);
+ free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ }
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
+#else
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
+#endif
+ val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+/*
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
+ */
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
+{
+ if (!(flags & SHOW_MEM_FILTER_NODES))
+ return false;
+
+ /*
+ * no node mask - aka implicit memory numa policy. Do not bother with
+ * the synchronization - read_mems_allowed_begin - because we do not
+ * have to be precise here.
+ */
+ if (!nodemask)
+ nodemask = &cpuset_current_mems_allowed;
+
+ return !node_isset(nid, *nodemask);
+}
+
+static void show_migration_types(unsigned char type)
+{
+ static const char types[MIGRATE_TYPES] = {
+ [MIGRATE_UNMOVABLE] = 'U',
+ [MIGRATE_MOVABLE] = 'M',
+ [MIGRATE_RECLAIMABLE] = 'E',
+ [MIGRATE_HIGHATOMIC] = 'H',
+#ifdef CONFIG_CMA
+ [MIGRATE_CMA] = 'C',
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ [MIGRATE_ISOLATE] = 'I',
+#endif
+ };
+ char tmp[MIGRATE_TYPES + 1];
+ char *p = tmp;
+ int i;
+
+ for (i = 0; i < MIGRATE_TYPES; i++) {
+ if (type & (1 << i))
+ *p++ = types[i];
+ }
+
+ *p = '\0';
+ printk(KERN_CONT "(%s) ", tmp);
+}
+
+static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
+{
+ int zone_idx;
+ for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
+ if (zone_managed_pages(pgdat->node_zones + zone_idx))
+ return true;
+ return false;
+}
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ * cpuset.
+ */
+void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
+{
+ unsigned long free_pcp = 0;
+ int cpu, nid;
+ struct zone *zone;
+ pg_data_t *pgdat;
+
+ for_each_populated_zone(zone) {
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+ }
+
+ printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+ " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
+ " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu\n"
+ " sec_pagetables:%lu bounce:%lu\n"
+ " kernel_misc_reclaimable:%lu\n"
+ " free:%lu free_pcp:%lu free_cma:%lu\n",
+ global_node_page_state(NR_ACTIVE_ANON),
+ global_node_page_state(NR_INACTIVE_ANON),
+ global_node_page_state(NR_ISOLATED_ANON),
+ global_node_page_state(NR_ACTIVE_FILE),
+ global_node_page_state(NR_INACTIVE_FILE),
+ global_node_page_state(NR_ISOLATED_FILE),
+ global_node_page_state(NR_UNEVICTABLE),
+ global_node_page_state(NR_FILE_DIRTY),
+ global_node_page_state(NR_WRITEBACK),
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
+ global_node_page_state(NR_FILE_MAPPED),
+ global_node_page_state(NR_SHMEM),
+ global_node_page_state(NR_PAGETABLE),
+ global_node_page_state(NR_SECONDARY_PAGETABLE),
+ global_zone_page_state(NR_BOUNCE),
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
+ global_zone_page_state(NR_FREE_PAGES),
+ free_pcp,
+ global_zone_page_state(NR_FREE_CMA_PAGES));
+
+ for_each_online_pgdat(pgdat) {
+ if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
+ continue;
+ if (!node_has_managed_zones(pgdat, max_zone_idx))
+ continue;
+
+ printk("Node %d"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
+ " mapped:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ " shmem_thp: %lukB"
+ " shmem_pmdmapped: %lukB"
+ " anon_thp: %lukB"
+#endif
+ " writeback_tmp:%lukB"
+ " kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+ " shadow_call_stack:%lukB"
+#endif
+ " pagetables:%lukB"
+ " sec_pagetables:%lukB"
+ " all_unreclaimable? %s"
+ "\n",
+ pgdat->node_id,
+ K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_UNEVICTABLE)),
+ K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+ K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+ K(node_page_state(pgdat, NR_FILE_MAPPED)),
+ K(node_page_state(pgdat, NR_FILE_DIRTY)),
+ K(node_page_state(pgdat, NR_WRITEBACK)),
+ K(node_page_state(pgdat, NR_SHMEM)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(node_page_state(pgdat, NR_SHMEM_THPS)),
+ K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
+ K(node_page_state(pgdat, NR_ANON_THPS)),
+#endif
+ K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
+ K(node_page_state(pgdat, NR_PAGETABLE)),
+ K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+ "yes" : "no");
+ }
+
+ for_each_populated_zone(zone) {
+ int i;
+
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+
+ free_pcp = 0;
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+
+ show_node(zone);
+ printk(KERN_CONT
+ "%s"
+ " free:%lukB"
+ " boost:%lukB"
+ " min:%lukB"
+ " low:%lukB"
+ " high:%lukB"
+ " reserved_highatomic:%luKB"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " writepending:%lukB"
+ " present:%lukB"
+ " managed:%lukB"
+ " mlocked:%lukB"
+ " bounce:%lukB"
+ " free_pcp:%lukB"
+ " local_pcp:%ukB"
+ " free_cma:%lukB"
+ "\n",
+ zone->name,
+ K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(zone->watermark_boost),
+ K(min_wmark_pages(zone)),
+ K(low_wmark_pages(zone)),
+ K(high_wmark_pages(zone)),
+ K(zone->nr_reserved_highatomic),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
+ K(zone->present_pages),
+ K(zone_managed_pages(zone)),
+ K(zone_page_state(zone, NR_MLOCK)),
+ K(zone_page_state(zone, NR_BOUNCE)),
+ K(free_pcp),
+ K(this_cpu_read(zone->per_cpu_pageset->count)),
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
+ printk(KERN_CONT "\n");
+ }
+
+ for_each_populated_zone(zone) {
+ unsigned int order;
+ unsigned long nr[MAX_ORDER + 1], flags, total = 0;
+ unsigned char types[MAX_ORDER + 1];
+
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+ show_node(zone);
+ printk(KERN_CONT "%s: ", zone->name);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order <= MAX_ORDER; order++) {
+ struct free_area *area = &zone->free_area[order];
+ int type;
+
+ nr[order] = area->nr_free;
+ total += nr[order] << order;
+
+ types[order] = 0;
+ for (type = 0; type < MIGRATE_TYPES; type++) {
+ if (!free_area_empty(area, type))
+ types[order] |= 1 << type;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ for (order = 0; order <= MAX_ORDER; order++) {
+ printk(KERN_CONT "%lu*%lukB ",
+ nr[order], K(1UL) << order);
+ if (nr[order])
+ show_migration_types(types[order]);
+ }
+ printk(KERN_CONT "= %lukB\n", K(total));
+ }
+
+ for_each_online_node(nid) {
+ if (show_mem_node_skip(filter, nid, nodemask))
+ continue;
+ hugetlb_show_meminfo_node(nid);
+ }
+
+ printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
+
+ show_swap_cache_info();
+}
+
+void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
+{
+ unsigned long total = 0, reserved = 0, highmem = 0;
+ struct zone *zone;
+
+ printk("Mem-Info:\n");
+ __show_free_areas(filter, nodemask, max_zone_idx);
+
+ for_each_populated_zone(zone) {
+
+ total += zone->present_pages;
+ reserved += zone->present_pages - zone_managed_pages(zone);
+
+ if (is_highmem(zone))
+ highmem += zone->present_pages;
+ }
+
+ printk("%lu pages RAM\n", total);
+ printk("%lu pages HighMem/MovableOnly\n", highmem);
+ printk("%lu pages reserved\n", reserved);
+#ifdef CONFIG_CMA
+ printk("%lu pages cma reserved\n", totalcma_pages);
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
+#endif
+}
diff --git a/mm/slab.c b/mm/slab.c
index bb57f7fdbae1..b7817dcba63e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1240,11 +1240,7 @@ void __init kmem_cache_init(void)
* Initialize the caches that provide memory for the kmem_cache_node
* structures first. Without this, further allocations will bug.
*/
- kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
- kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL],
- kmalloc_info[INDEX_NODE].size,
- ARCH_KMALLOC_FLAGS, 0,
- kmalloc_info[INDEX_NODE].size);
+ new_kmalloc_cache(INDEX_NODE, KMALLOC_NORMAL, ARCH_KMALLOC_FLAGS);
slab_state = PARTIAL_NODE;
setup_kmalloc_cache_index_table();
diff --git a/mm/slab.h b/mm/slab.h
index bc36edd5ba4f..a59c8e5d2441 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -294,9 +294,8 @@ gfp_t kmalloc_fix_flags(gfp_t flags);
/* Functions provided by the slab allocators */
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
-struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size,
- slab_flags_t flags, unsigned int useroffset,
- unsigned int usersize);
+void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type,
+ slab_flags_t flags);
extern void create_boot_cache(struct kmem_cache *, const char *name,
unsigned int size, slab_flags_t flags,
unsigned int useroffset, unsigned int usersize);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 607249785c07..43c008165f56 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -17,6 +17,8 @@
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
#include <linux/kasan.h>
@@ -658,17 +660,16 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
s->refcount = -1; /* Exempt from merging for now */
}
-struct kmem_cache *__init create_kmalloc_cache(const char *name,
- unsigned int size, slab_flags_t flags,
- unsigned int useroffset, unsigned int usersize)
+static struct kmem_cache *__init create_kmalloc_cache(const char *name,
+ unsigned int size,
+ slab_flags_t flags)
{
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
if (!s)
panic("Out of memory when creating slab %s\n", name);
- create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
- usersize);
+ create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
list_add(&s->list, &slab_caches);
s->refcount = 1;
return s;
@@ -863,9 +864,22 @@ void __init setup_kmalloc_cache_index_table(void)
}
}
-static void __init
+static unsigned int __kmalloc_minalign(void)
+{
+#ifdef CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC
+ if (io_tlb_default_mem.nslabs)
+ return ARCH_KMALLOC_MINALIGN;
+#endif
+ return dma_get_cache_alignment();
+}
+
+void __init
new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
{
+ unsigned int minalign = __kmalloc_minalign();
+ unsigned int aligned_size = kmalloc_info[idx].size;
+ int aligned_idx = idx;
+
if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
flags |= SLAB_RECLAIM_ACCOUNT;
} else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
@@ -878,10 +892,17 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
flags |= SLAB_CACHE_DMA;
}
- kmalloc_caches[type][idx] = create_kmalloc_cache(
- kmalloc_info[idx].name[type],
- kmalloc_info[idx].size, flags, 0,
- kmalloc_info[idx].size);
+ if (minalign > ARCH_KMALLOC_MINALIGN) {
+ aligned_size = ALIGN(aligned_size, minalign);
+ aligned_idx = __kmalloc_index(aligned_size, false);
+ }
+
+ if (!kmalloc_caches[type][aligned_idx])
+ kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
+ kmalloc_info[aligned_idx].name[type],
+ aligned_size, flags);
+ if (idx != aligned_idx)
+ kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
/*
* If CONFIG_MEMCG_KMEM is enabled, disable cache merging for
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 10d73a0dfcec..a044a130405b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -133,7 +133,7 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size,
void __meminit vmemmap_verify(pte_t *pte, int node,
unsigned long start, unsigned long end)
{
- unsigned long pfn = pte_pfn(*pte);
+ unsigned long pfn = pte_pfn(ptep_get(pte));
int actual_node = early_pfn_to_nid(pfn);
if (node_distance(actual_node, node) > LOCAL_DISTANCE)
@@ -146,7 +146,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
struct page *reuse)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
- if (pte_none(*pte)) {
+ if (pte_none(ptep_get(pte))) {
pte_t entry;
void *p;
@@ -414,7 +414,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
* with just tail struct pages.
*/
return vmemmap_populate_range(start, end, node, NULL,
- pte_page(*pte));
+ pte_page(ptep_get(pte)));
}
size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
@@ -438,7 +438,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
*/
next += PAGE_SIZE;
rc = vmemmap_populate_range(next, last, node, NULL,
- pte_page(*pte));
+ pte_page(ptep_get(pte)));
if (rc)
return -ENOMEM;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index c2afdb26039e..7a29e10193fe 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -922,10 +922,14 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
return 0;
}
-void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
- unsigned long nr_pages, unsigned long map_offset,
- struct vmem_altmap *altmap)
+void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
{
+ struct mem_section *ms = __pfn_to_section(pfn);
+
+ if (WARN_ON_ONCE(!valid_section(ms)))
+ return;
+
section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 423199ee8478..cd8f0150ba3a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -76,7 +76,7 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
/*
* This path almost never happens for VM activity - pages are normally freed
- * via pagevecs. But it gets used by networking - and for compound pages.
+ * in batches. But it gets used by networking - and for compound pages.
*/
static void __page_cache_release(struct folio *folio)
{
@@ -1044,25 +1044,25 @@ void release_pages(release_pages_arg arg, int nr)
EXPORT_SYMBOL(release_pages);
/*
- * The pages which we're about to release may be in the deferred lru-addition
+ * The folios which we're about to release may be in the deferred lru-addition
* queues. That would prevent them from really being freed right now. That's
- * OK from a correctness point of view but is inefficient - those pages may be
+ * OK from a correctness point of view but is inefficient - those folios may be
* cache-warm and we want to give them back to the page allocator ASAP.
*
- * So __pagevec_release() will drain those queues here.
+ * So __folio_batch_release() will drain those queues here.
* folio_batch_move_lru() calls folios_put() directly to avoid
* mutual recursion.
*/
-void __pagevec_release(struct pagevec *pvec)
+void __folio_batch_release(struct folio_batch *fbatch)
{
- if (!pvec->percpu_pvec_drained) {
+ if (!fbatch->percpu_pvec_drained) {
lru_add_drain();
- pvec->percpu_pvec_drained = true;
+ fbatch->percpu_pvec_drained = true;
}
- release_pages(pvec->pages, pagevec_count(pvec));
- pagevec_reinit(pvec);
+ release_pages(fbatch->folios, folio_batch_count(fbatch));
+ folio_batch_reinit(fbatch);
}
-EXPORT_SYMBOL(__pagevec_release);
+EXPORT_SYMBOL(__folio_batch_release);
/**
* folio_batch_remove_exceptionals() - Prune non-folios from a batch.
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b76a65ac28b3..f8ea7015bad4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -16,7 +16,6 @@
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
-#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
@@ -275,9 +274,9 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
}
}
-/*
- * If we are the only user, then try to free up the swap cache.
- *
+/*
+ * If we are the only user, then try to free up the swap cache.
+ *
* Its ok to check the swapcache flag without the folio lock
* here because we are going to recheck again inside
* folio_free_swap() _with_ the lock.
@@ -294,7 +293,7 @@ void free_swap_cache(struct page *page)
}
}
-/*
+/*
* Perform a free_page(), also freeing any swap cache associated with
* this page if it is the last user of the page.
*/
@@ -417,9 +416,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
{
struct swap_info_struct *si;
struct folio *folio;
+ struct page *page;
void *shadow = NULL;
*new_page_allocated = false;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
for (;;) {
int err;
@@ -428,14 +431,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after swap_cache_get_folio() failed, re-calling
* that would confuse statistics.
*/
- si = get_swap_device(entry);
- if (!si)
- return NULL;
folio = filemap_get_folio(swap_address_space(entry),
swp_offset(entry));
- put_swap_device(si);
- if (!IS_ERR(folio))
- return folio_file_page(folio, swp_offset(entry));
+ if (!IS_ERR(folio)) {
+ page = folio_file_page(folio, swp_offset(entry));
+ goto got_page;
+ }
/*
* Just skip read ahead for unused swap slot.
@@ -445,8 +446,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* as SWAP_HAS_CACHE. That's done in later part of code or
* else swap_off will be aborted if we return NULL.
*/
- if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
- return NULL;
+ if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
+ goto fail_put_swap;
/*
* Get a new page to read into from swap. Allocate it now,
@@ -455,7 +456,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*/
folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
if (!folio)
- return NULL;
+ goto fail_put_swap;
/*
* Swap entry may have been freed since our caller observed it.
@@ -466,7 +467,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
folio_put(folio);
if (err != -EEXIST)
- return NULL;
+ goto fail_put_swap;
/*
* We might race against __delete_from_swap_cache(), and
@@ -500,12 +501,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/* Caller will initiate read into locked folio */
folio_add_lru(folio);
*new_page_allocated = true;
- return &folio->page;
+ page = &folio->page;
+got_page:
+ put_swap_device(si);
+ return page;
fail_unlock:
put_swap_folio(folio, entry);
folio_unlock(folio);
folio_put(folio);
+fail_put_swap:
+ put_swap_device(si);
return NULL;
}
@@ -514,6 +520,10 @@ fail_unlock:
* and reading the disk if it is not already cached.
* A failure return means that either the page allocation failed or that
* the swap entry is no longer in use.
+ *
+ * get/put_swap_device() aren't needed to call this function, because
+ * __read_swap_cache_async() call them and swap_readpage() holds the
+ * swap cache folio lock.
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
@@ -698,6 +708,14 @@ void exit_swap_address_space(unsigned int type)
swapper_spaces[type] = NULL;
}
+#define SWAP_RA_ORDER_CEILING 5
+
+struct vma_swap_readahead {
+ unsigned short win;
+ unsigned short offset;
+ unsigned short nr_pte;
+};
+
static void swap_ra_info(struct vm_fault *vmf,
struct vma_swap_readahead *ra_info)
{
@@ -705,11 +723,7 @@ static void swap_ra_info(struct vm_fault *vmf,
unsigned long ra_val;
unsigned long faddr, pfn, fpfn, lpfn, rpfn;
unsigned long start, end;
- pte_t *pte, *orig_pte;
unsigned int max_win, hits, prev_win, win;
-#ifndef CONFIG_64BIT
- pte_t *tpte;
-#endif
max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
SWAP_RA_ORDER_CEILING);
@@ -728,12 +742,9 @@ static void swap_ra_info(struct vm_fault *vmf,
max_win, prev_win);
atomic_long_set(&vma->swap_readahead_info,
SWAP_RA_VAL(faddr, win, 0));
-
if (win == 1)
return;
- /* Copy the PTEs because the page table may be unmapped */
- orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
if (fpfn == pfn + 1) {
lpfn = fpfn;
rpfn = fpfn + win;
@@ -753,15 +764,6 @@ static void swap_ra_info(struct vm_fault *vmf,
ra_info->nr_pte = end - start;
ra_info->offset = fpfn - start;
- pte -= ra_info->offset;
-#ifdef CONFIG_64BIT
- ra_info->ptes = pte;
-#else
- tpte = ra_info->ptes;
- for (pfn = start; pfn != end; pfn++)
- *tpte++ = *pte++;
-#endif
- pte_unmap(orig_pte);
}
/**
@@ -785,7 +787,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
struct swap_iocb *splug = NULL;
struct vm_area_struct *vma = vmf->vma;
struct page *page;
- pte_t *pte, pentry;
+ pte_t *pte = NULL, pentry;
+ unsigned long addr;
swp_entry_t entry;
unsigned int i;
bool page_allocated;
@@ -797,17 +800,25 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
if (ra_info.win == 1)
goto skip;
+ addr = vmf->address - (ra_info.offset * PAGE_SIZE);
+
blk_start_plug(&plug);
- for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
- i++, pte++) {
- pentry = *pte;
+ for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) {
+ if (!pte++) {
+ pte = pte_offset_map(vmf->pmd, addr);
+ if (!pte)
+ break;
+ }
+ pentry = ptep_get_lockless(pte);
if (!is_swap_pte(pentry))
continue;
entry = pte_to_swp_entry(pentry);
if (unlikely(non_swap_entry(entry)))
continue;
+ pte_unmap(pte);
+ pte = NULL;
page = __read_swap_cache_async(entry, gfp_mask, vma,
- vmf->address, &page_allocated);
+ addr, &page_allocated);
if (!page)
continue;
if (page_allocated) {
@@ -819,6 +830,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
}
put_page(page);
}
+ if (pte)
+ pte_unmap(pte);
blk_finish_plug(&plug);
swap_read_unplug(splug);
lru_add_drain();
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6bc83060df9a..8e6dde68b389 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -41,6 +41,7 @@
#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
+#include <linux/suspend.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
@@ -1219,6 +1220,13 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
}
/*
+ * When we get a swap entry, if there aren't some other ways to
+ * prevent swapoff, such as the folio in swap cache is locked, page
+ * table lock is held, etc., the swap entry may become invalid because
+ * of swapoff. Then, we need to enclose all swap related functions
+ * with get_swap_device() and put_swap_device(), unless the swap
+ * functions call get/put_swap_device() by themselves.
+ *
* Check whether swap entry is valid in the swap device. If so,
* return pointer to swap_info_struct, and keep the swap entry valid
* via preventing the swap device from being swapoff, until
@@ -1227,9 +1235,8 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
* Notice that swapoff or swapoff+swapon can still happen before the
* percpu_ref_tryget_live() in get_swap_device() or after the
* percpu_ref_put() in put_swap_device() if there isn't any other way
- * to prevent swapoff, such as page lock, page table lock, etc. The
- * caller must be prepared for that. For example, the following
- * situation is possible.
+ * to prevent swapoff. The caller must be prepared for that. For
+ * example, the following situation is possible.
*
* CPU1 CPU2
* do_swap_page()
@@ -1432,16 +1439,10 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
int __swap_count(swp_entry_t entry)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = swp_swap_info(entry);
pgoff_t offset = swp_offset(entry);
- int count = 0;
- si = get_swap_device(entry);
- if (si) {
- count = swap_count(si->swap_map[offset]);
- put_swap_device(si);
- }
- return count;
+ return swap_count(si->swap_map[offset]);
}
/*
@@ -1449,7 +1450,7 @@ int __swap_count(swp_entry_t entry)
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
-static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
pgoff_t offset = swp_offset(entry);
struct swap_cluster_info *ci;
@@ -1463,24 +1464,6 @@ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
/*
* How many references to @entry are currently swapped out?
- * This does not give an exact answer when swap count is continued,
- * but does include the high COUNT_CONTINUED flag to allow for that.
- */
-int __swp_swapcount(swp_entry_t entry)
-{
- int count = 0;
- struct swap_info_struct *si;
-
- si = get_swap_device(entry);
- if (si) {
- count = swap_swapcount(si, entry);
- put_swap_device(si);
- }
- return count;
-}
-
-/*
- * How many references to @entry are currently swapped out?
* This considers COUNT_CONTINUED so it returns exact answer.
*/
int swp_swapcount(swp_entry_t entry)
@@ -1762,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page = folio_file_page(folio, swp_offset(entry));
struct page *swapcache;
spinlock_t *ptl;
- pte_t *pte, new_pte;
+ pte_t *pte, new_pte, old_pte;
bool hwposioned = false;
int ret = 1;
@@ -1774,11 +1757,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
hwposioned = true;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
+ if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
+ swp_entry_to_pte(entry)))) {
ret = 0;
goto out;
}
+ old_pte = ptep_get(pte);
+
if (unlikely(hwposioned || !PageUptodate(page))) {
swp_entry_t swp_entry;
@@ -1810,7 +1796,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
* call and have the page locked.
*/
VM_BUG_ON_PAGE(PageWriteback(page), page);
- if (pte_swp_exclusive(*pte))
+ if (pte_swp_exclusive(old_pte))
rmap_flags |= RMAP_EXCLUSIVE;
page_add_anon_rmap(page, vma, addr, rmap_flags);
@@ -1819,15 +1805,16 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
lru_cache_add_inactive_or_unevictable(page, vma);
}
new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
- if (pte_swp_soft_dirty(*pte))
+ if (pte_swp_soft_dirty(old_pte))
new_pte = pte_mksoft_dirty(new_pte);
- if (pte_swp_uffd_wp(*pte))
+ if (pte_swp_uffd_wp(old_pte))
new_pte = pte_mkuffd_wp(new_pte);
setpte:
set_pte_at(vma->vm_mm, addr, pte, new_pte);
swap_free(entry);
out:
- pte_unmap_unlock(pte, ptl);
+ if (pte)
+ pte_unmap_unlock(pte, ptl);
if (page != swapcache) {
unlock_page(page);
put_page(page);
@@ -1839,27 +1826,37 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned int type)
{
- swp_entry_t entry;
- pte_t *pte;
+ pte_t *pte = NULL;
struct swap_info_struct *si;
- int ret = 0;
si = swap_info[type];
- pte = pte_offset_map(pmd, addr);
do {
struct folio *folio;
unsigned long offset;
unsigned char swp_count;
+ swp_entry_t entry;
+ int ret;
+ pte_t ptent;
+
+ if (!pte++) {
+ pte = pte_offset_map(pmd, addr);
+ if (!pte)
+ break;
+ }
+
+ ptent = ptep_get_lockless(pte);
- if (!is_swap_pte(*pte))
+ if (!is_swap_pte(ptent))
continue;
- entry = pte_to_swp_entry(*pte);
+ entry = pte_to_swp_entry(ptent);
if (swp_type(entry) != type)
continue;
offset = swp_offset(entry);
pte_unmap(pte);
+ pte = NULL;
+
folio = swap_cache_get_folio(entry, vma, addr);
if (!folio) {
struct page *page;
@@ -1878,8 +1875,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!folio) {
swp_count = READ_ONCE(si->swap_map[offset]);
if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
- goto try_next;
-
+ continue;
return -ENOMEM;
}
@@ -1889,20 +1885,17 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (ret < 0) {
folio_unlock(folio);
folio_put(folio);
- goto out;
+ return ret;
}
folio_free_swap(folio);
folio_unlock(folio);
folio_put(folio);
-try_next:
- pte = pte_offset_map(pmd, addr);
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ } while (addr += PAGE_SIZE, addr != end);
- ret = 0;
-out:
- return ret;
+ if (pte)
+ pte_unmap(pte);
+ return 0;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -1917,8 +1910,6 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
do {
cond_resched();
next = pmd_addr_end(addr, end);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- continue;
ret = unuse_pte_range(vma, pmd, addr, next, type);
if (ret)
return ret;
@@ -3288,9 +3279,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unsigned char has_cache;
int err;
- p = get_swap_device(entry);
- if (!p)
- return -EINVAL;
+ p = swp_swap_info(entry);
offset = swp_offset(entry);
ci = lock_cluster_or_swap_info(p, offset);
@@ -3337,7 +3326,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
- put_swap_device(p);
return err;
}
@@ -3468,11 +3456,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
goto out;
}
- /*
- * We are fortunate that although vmalloc_to_page uses pte_offset_map,
- * no architecture is using highmem pages for kernel page tables: so it
- * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
- */
head = vmalloc_to_page(si->swap_map + offset);
offset &= ~PAGE_MASK;
diff --git a/mm/truncate.c b/mm/truncate.c
index 86de31ed4d32..95d1291d269b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -486,18 +486,17 @@ void truncate_inode_pages_final(struct address_space *mapping)
EXPORT_SYMBOL(truncate_inode_pages_final);
/**
- * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
+ * mapping_try_invalidate - Invalidate all the evictable folios of one inode
+ * @mapping: the address_space which holds the folios to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
- * @nr_pagevec: invalidate failed page number for caller
+ * @nr_failed: How many folio invalidations failed
*
- * This helper is similar to invalidate_mapping_pages(), except that it accounts
- * for pages that are likely on a pagevec and counts them in @nr_pagevec, which
- * will be used by the caller.
+ * This function is similar to invalidate_mapping_pages(), except that it
+ * returns the number of folios which could not be evicted in @nr_failed.
*/
-unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
- pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
+unsigned long mapping_try_invalidate(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_failed)
{
pgoff_t indices[PAGEVEC_SIZE];
struct folio_batch fbatch;
@@ -527,9 +526,9 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
*/
if (!ret) {
deactivate_file_folio(folio);
- /* It is likely on the pagevec of a remote CPU */
- if (nr_pagevec)
- (*nr_pagevec)++;
+ /* Likely in the lru cache of a remote CPU */
+ if (nr_failed)
+ (*nr_failed)++;
}
count += ret;
}
@@ -552,12 +551,12 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
* If you want to remove all the pages of one inode, regardless of
* their use and writeback state, use truncate_inode_pages().
*
- * Return: the number of the cache entries that were invalidated
+ * Return: The number of indices that had their contents invalidated
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- return invalidate_mapping_pagevec(mapping, start, end, NULL);
+ return mapping_try_invalidate(mapping, start, end, NULL);
}
EXPORT_SYMBOL(invalidate_mapping_pages);
@@ -566,7 +565,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
* refcount. We do this because invalidate_inode_pages2() needs stronger
* invalidation guarantees, and cannot afford to leave pages behind because
* shrink_page_list() has a temp ref on them, or because they're transiently
- * sitting in the folio_add_lru() pagevecs.
+ * sitting in the folio_add_lru() caches.
*/
static int invalidate_complete_folio2(struct address_space *mapping,
struct folio *folio)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e97a0b4889fc..a2bf37ee276d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -76,7 +76,10 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
if (flags & MFILL_ATOMIC_WP)
_dst_pte = pte_mkuffd_wp(_dst_pte);
+ ret = -EAGAIN;
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
if (vma_is_shmem(dst_vma)) {
/* serialize against truncate with the page table lock */
@@ -94,7 +97,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
* registered, we firstly wr-protect a none pte which has no page cache
* page backing it, then access the page.
*/
- if (!pte_none_mostly(*dst_pte))
+ if (!pte_none_mostly(ptep_get(dst_pte)))
goto out_unlock;
folio = page_folio(page);
@@ -121,6 +124,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
ret = 0;
out_unlock:
pte_unmap_unlock(dst_pte, ptl);
+out:
return ret;
}
@@ -212,7 +216,10 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
+ ret = -EAGAIN;
dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
if (dst_vma->vm_file) {
/* the shmem MAP_PRIVATE case requires checking the i_size */
inode = dst_vma->vm_file->f_inode;
@@ -223,7 +230,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
goto out_unlock;
}
ret = -EEXIST;
- if (!pte_none(*dst_pte))
+ if (!pte_none(ptep_get(dst_pte)))
goto out_unlock;
set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
@@ -231,6 +238,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
ret = 0;
out_unlock:
pte_unmap_unlock(dst_pte, ptl);
+out:
return ret;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d13d71687d7..93cf99aba335 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -103,7 +103,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (!pte)
return -ENOMEM;
do {
- BUG_ON(!pte_none(*pte));
+ BUG_ON(!pte_none(ptep_get(pte)));
#ifdef CONFIG_HUGETLB_PAGE
size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
@@ -472,7 +472,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
do {
struct page *page = pages[*nr];
- if (WARN_ON(!pte_none(*pte)))
+ if (WARN_ON(!pte_none(ptep_get(pte))))
return -EBUSY;
if (WARN_ON(!page))
return -ENOMEM;
@@ -703,11 +703,10 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
- ptep = pte_offset_map(pmd, addr);
- pte = *ptep;
+ ptep = pte_offset_kernel(pmd, addr);
+ pte = ptep_get(ptep);
if (pte_present(pte))
page = pte_page(pte);
- pte_unmap(ptep);
return page;
}
@@ -791,7 +790,7 @@ get_subtree_max_size(struct rb_node *node)
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
-static void purge_vmap_area_lazy(void);
+static void reclaim_and_purge_vmap_areas(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
@@ -1649,7 +1648,7 @@ retry:
overflow:
if (!purged) {
- purge_vmap_area_lazy();
+ reclaim_and_purge_vmap_areas();
purged = 1;
goto retry;
}
@@ -1785,9 +1784,10 @@ out:
}
/*
- * Kick off a purge of the outstanding lazy areas.
+ * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list.
*/
-static void purge_vmap_area_lazy(void)
+static void reclaim_and_purge_vmap_areas(void)
+
{
mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
@@ -1908,6 +1908,12 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
+/*
+ * Purge threshold to prevent overeager purging of fragmented blocks for
+ * regular operations: Purge if vb->free is less than 1/4 of the capacity.
+ */
+#define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4)
+
#define VMAP_RAM 0x1 /* indicates vm_map_ram area*/
#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/
#define VMAP_FLAGS_MASK 0x3
@@ -2086,39 +2092,62 @@ static void free_vmap_block(struct vmap_block *vb)
kfree_rcu(vb, rcu_head);
}
+static bool purge_fragmented_block(struct vmap_block *vb,
+ struct vmap_block_queue *vbq, struct list_head *purge_list,
+ bool force_purge)
+{
+ if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
+ vb->dirty == VMAP_BBMAP_BITS)
+ return false;
+
+ /* Don't overeagerly purge usable blocks unless requested */
+ if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD))
+ return false;
+
+ /* prevent further allocs after releasing lock */
+ WRITE_ONCE(vb->free, 0);
+ /* prevent purging it again */
+ WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
+ vb->dirty_min = 0;
+ vb->dirty_max = VMAP_BBMAP_BITS;
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ list_add_tail(&vb->purge, purge_list);
+ return true;
+}
+
+static void free_purged_blocks(struct list_head *purge_list)
+{
+ struct vmap_block *vb, *n_vb;
+
+ list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
+ list_del(&vb->purge);
+ free_vmap_block(vb);
+ }
+}
+
static void purge_fragmented_blocks(int cpu)
{
LIST_HEAD(purge);
struct vmap_block *vb;
- struct vmap_block *n_vb;
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ unsigned long free = READ_ONCE(vb->free);
+ unsigned long dirty = READ_ONCE(vb->dirty);
- if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
+ if (free + dirty != VMAP_BBMAP_BITS ||
+ dirty == VMAP_BBMAP_BITS)
continue;
spin_lock(&vb->lock);
- if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
- vb->free = 0; /* prevent further allocs after releasing lock */
- vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
- vb->dirty_min = 0;
- vb->dirty_max = VMAP_BBMAP_BITS;
- spin_lock(&vbq->lock);
- list_del_rcu(&vb->free_list);
- spin_unlock(&vbq->lock);
- spin_unlock(&vb->lock);
- list_add_tail(&vb->purge, &purge);
- } else
- spin_unlock(&vb->lock);
+ purge_fragmented_block(vb, vbq, &purge, true);
+ spin_unlock(&vb->lock);
}
rcu_read_unlock();
-
- list_for_each_entry_safe(vb, n_vb, &purge, purge) {
- list_del(&vb->purge);
- free_vmap_block(vb);
- }
+ free_purged_blocks(&purge);
}
static void purge_fragmented_blocks_allcpus(void)
@@ -2153,6 +2182,9 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
unsigned long pages_off;
+ if (READ_ONCE(vb->free) < (1UL << order))
+ continue;
+
spin_lock(&vb->lock);
if (vb->free < (1UL << order)) {
spin_unlock(&vb->lock);
@@ -2161,7 +2193,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
pages_off = VMAP_BBMAP_BITS - vb->free;
vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
- vb->free -= 1UL << order;
+ WRITE_ONCE(vb->free, vb->free - (1UL << order));
bitmap_set(vb->used_map, pages_off, (1UL << order));
if (vb->free == 0) {
spin_lock(&vbq->lock);
@@ -2211,11 +2243,11 @@ static void vb_free(unsigned long addr, unsigned long size)
spin_lock(&vb->lock);
- /* Expand dirty range */
+ /* Expand the not yet TLB flushed dirty range */
vb->dirty_min = min(vb->dirty_min, offset);
vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
- vb->dirty += 1UL << order;
+ WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
if (vb->dirty == VMAP_BBMAP_BITS) {
BUG_ON(vb->free);
spin_unlock(&vb->lock);
@@ -2226,21 +2258,30 @@ static void vb_free(unsigned long addr, unsigned long size)
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
+ LIST_HEAD(purge_list);
int cpu;
if (unlikely(!vmap_initialized))
return;
- might_sleep();
+ mutex_lock(&vmap_purge_lock);
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
+ unsigned long idx;
rcu_read_lock();
- list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ xa_for_each(&vbq->vmap_blocks, idx, vb) {
spin_lock(&vb->lock);
- if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
+
+ /*
+ * Try to purge a fragmented block first. If it's
+ * not purgeable, check whether there is dirty
+ * space to be flushed.
+ */
+ if (!purge_fragmented_block(vb, vbq, &purge_list, false) &&
+ vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
unsigned long va_start = vb->va->va_start;
unsigned long s, e;
@@ -2250,15 +2291,18 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
start = min(s, start);
end = max(e, end);
+ /* Prevent that this is flushed again */
+ vb->dirty_min = VMAP_BBMAP_BITS;
+ vb->dirty_max = 0;
+
flush = 1;
}
spin_unlock(&vb->lock);
}
rcu_read_unlock();
}
+ free_purged_blocks(&purge_list);
- mutex_lock(&vmap_purge_lock);
- purge_fragmented_blocks_allcpus();
if (!__purge_vmap_area_lazy(start, end) && flush)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
@@ -2899,10 +2943,16 @@ struct vmap_pfn_data {
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
struct vmap_pfn_data *data = private;
+ unsigned long pfn = data->pfns[data->idx];
+ pte_t ptent;
- if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
+ if (WARN_ON_ONCE(pfn_valid(pfn)))
return -EINVAL;
- *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
+
+ ptent = pte_mkspecial(pfn_pte(pfn, data->prot));
+ set_pte_at(&init_mm, addr, pte, ptent);
+
+ data->idx++;
return 0;
}
@@ -3520,7 +3570,7 @@ static size_t zero_iter(struct iov_iter *iter, size_t count)
while (remains > 0) {
size_t num, copied;
- num = remains < PAGE_SIZE ? remains : PAGE_SIZE;
+ num = min_t(size_t, remains, PAGE_SIZE);
copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
remains -= copied;
@@ -4151,7 +4201,7 @@ recovery:
overflow:
spin_unlock(&free_vmap_area_lock);
if (!purged) {
- purge_vmap_area_lazy();
+ reclaim_and_purge_vmap_areas();
purged = true;
/* Before "retry", check if we recover. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5bf98d0a22c9..1080209a568b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -429,12 +429,17 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
up_read(&shrinker_rwsem);
}
+/* Returns true for reclaim through cgroup limits or cgroup interfaces. */
static bool cgroup_reclaim(struct scan_control *sc)
{
return sc->target_mem_cgroup;
}
-static bool global_reclaim(struct scan_control *sc)
+/*
+ * Returns true for reclaim on the root cgroup. This is true for direct
+ * allocator reclaim and reclaim through cgroup interfaces on the root cgroup.
+ */
+static bool root_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
}
@@ -489,7 +494,7 @@ static bool cgroup_reclaim(struct scan_control *sc)
return false;
}
-static bool global_reclaim(struct scan_control *sc)
+static bool root_reclaim(struct scan_control *sc)
{
return true;
}
@@ -546,7 +551,7 @@ static void flush_reclaim_state(struct scan_control *sc)
* memcg reclaim, to make reporting more accurate and reduce
* underestimation, but it's probably not worth the complexity for now.
*/
- if (current->reclaim_state && global_reclaim(sc)) {
+ if (current->reclaim_state && root_reclaim(sc)) {
sc->nr_reclaimed += current->reclaim_state->reclaimed;
current->reclaim_state->reclaimed = 0;
}
@@ -1606,9 +1611,10 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}
-static struct page *alloc_demote_page(struct page *page, unsigned long private)
+static struct folio *alloc_demote_folio(struct folio *src,
+ unsigned long private)
{
- struct page *target_page;
+ struct folio *dst;
nodemask_t *allowed_mask;
struct migration_target_control *mtc;
@@ -1626,14 +1632,14 @@ static struct page *alloc_demote_page(struct page *page, unsigned long private)
*/
mtc->nmask = NULL;
mtc->gfp_mask |= __GFP_THISNODE;
- target_page = alloc_migration_target(page, (unsigned long)mtc);
- if (target_page)
- return target_page;
+ dst = alloc_migration_target(src, (unsigned long)mtc);
+ if (dst)
+ return dst;
mtc->gfp_mask &= ~__GFP_THISNODE;
mtc->nmask = allowed_mask;
- return alloc_migration_target(page, (unsigned long)mtc);
+ return alloc_migration_target(src, (unsigned long)mtc);
}
/*
@@ -1668,7 +1674,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
node_get_allowed_targets(pgdat, &allowed_mask);
/* Demotion ignores all cpuset and mempolicy settings */
- migrate_pages(demote_folios, alloc_demote_page, NULL,
+ migrate_pages(demote_folios, alloc_demote_folio, NULL,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
@@ -2255,6 +2261,25 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
}
+#ifdef CONFIG_CMA
+/*
+ * It is waste of effort to scan and reclaim CMA pages if it is not available
+ * for current allocation context. Kswapd can not be enrolled as it can not
+ * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL
+ */
+static bool skip_cma(struct folio *folio, struct scan_control *sc)
+{
+ return !current_is_kswapd() &&
+ gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
+ get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
+}
+#else
+static bool skip_cma(struct folio *folio, struct scan_control *sc)
+{
+ return false;
+}
+#endif
+
/*
* Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
*
@@ -2301,7 +2326,8 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (folio_zonenum(folio) > sc->reclaim_idx) {
+ if (folio_zonenum(folio) > sc->reclaim_idx ||
+ skip_cma(folio, sc)) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
goto move;
@@ -2443,7 +2469,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
* won't get blocked by normal direct-reclaimers, forming a circular
* deadlock.
*/
- if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
+ if (gfp_has_io_fs(sc->gfp_mask))
inactive >>= 3;
too_many = isolated > inactive;
@@ -3218,6 +3244,16 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
#endif
+static bool should_walk_mmu(void)
+{
+ return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK);
+}
+
+static bool should_clear_pmd_young(void)
+{
+ return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG);
+}
+
/******************************************************************************
* shorthand helpers
******************************************************************************/
@@ -3978,28 +4014,29 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
- VM_WARN_ON_ONCE(pmd_leaf(*pmd));
-
- ptl = pte_lockptr(args->mm, pmd);
- if (!spin_trylock(ptl))
+ pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
+ if (!pte)
return false;
+ if (!spin_trylock(ptl)) {
+ pte_unmap(pte);
+ return false;
+ }
arch_enter_lazy_mmu_mode();
-
- pte = pte_offset_map(pmd, start & PMD_MASK);
restart:
for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
unsigned long pfn;
struct folio *folio;
+ pte_t ptent = ptep_get(pte + i);
total++;
walk->mm_stats[MM_LEAF_TOTAL]++;
- pfn = get_pte_pfn(pte[i], args->vma, addr);
+ pfn = get_pte_pfn(ptent, args->vma, addr);
if (pfn == -1)
continue;
- if (!pte_young(pte[i])) {
+ if (!pte_young(ptent)) {
walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -4014,7 +4051,7 @@ restart:
young++;
walk->mm_stats[MM_LEAF_YOUNG]++;
- if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
!(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
!folio_test_swapcache(folio)))
folio_mark_dirty(folio);
@@ -4027,10 +4064,8 @@ restart:
if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
- pte_unmap(pte);
-
arch_leave_lazy_mmu_mode();
- spin_unlock(ptl);
+ pte_unmap_unlock(pte, ptl);
return suitable_to_scan(total, young);
}
@@ -4082,7 +4117,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ if (should_clear_pmd_young())
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
@@ -4128,7 +4163,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
unsigned long next;
unsigned long addr;
struct vm_area_struct *vma;
- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+ DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
unsigned long first = -1;
struct lru_gen_mm_walk *walk = args->private;
@@ -4175,7 +4210,7 @@ restart:
#endif
walk->mm_stats[MM_NONLEAF_TOTAL]++;
- if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (should_clear_pmd_young()) {
if (!pmd_young(val))
continue;
@@ -4477,7 +4512,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
+ if (!should_walk_mmu()) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -4659,12 +4694,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
unsigned long pfn;
+ pte_t ptent = ptep_get(pte + i);
- pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+ pfn = get_pte_pfn(ptent, pvmw->vma, addr);
if (pfn == -1)
continue;
- if (!pte_young(pte[i]))
+ if (!pte_young(ptent))
continue;
folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
@@ -4676,7 +4712,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
young++;
- if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
!(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
!folio_test_swapcache(folio)))
folio_mark_dirty(folio);
@@ -4728,10 +4764,11 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{
int seg;
int old, new;
+ unsigned long flags;
int bin = get_random_u32_below(MEMCG_NR_BINS);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- spin_lock(&pgdat->memcg_lru.lock);
+ spin_lock_irqsave(&pgdat->memcg_lru.lock, flags);
VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
@@ -4766,7 +4803,7 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
- spin_unlock(&pgdat->memcg_lru.lock);
+ spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
}
void lru_gen_online_memcg(struct mem_cgroup *memcg)
@@ -4779,7 +4816,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg)
struct pglist_data *pgdat = NODE_DATA(nid);
struct lruvec *lruvec = get_lruvec(memcg, nid);
- spin_lock(&pgdat->memcg_lru.lock);
+ spin_lock_irq(&pgdat->memcg_lru.lock);
VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
@@ -4790,7 +4827,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg)
lruvec->lrugen.gen = gen;
- spin_unlock(&pgdat->memcg_lru.lock);
+ spin_unlock_irq(&pgdat->memcg_lru.lock);
}
}
@@ -4814,7 +4851,7 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg)
struct pglist_data *pgdat = NODE_DATA(nid);
struct lruvec *lruvec = get_lruvec(memcg, nid);
- spin_lock(&pgdat->memcg_lru.lock);
+ spin_lock_irq(&pgdat->memcg_lru.lock);
VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
@@ -4826,12 +4863,14 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg)
if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
- spin_unlock(&pgdat->memcg_lru.lock);
+ spin_unlock_irq(&pgdat->memcg_lru.lock);
}
}
-void lru_gen_soft_reclaim(struct lruvec *lruvec)
+void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
/* see the comment on MEMCG_NR_GENS */
if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
@@ -4897,7 +4936,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
lrugen->protected[hist][type][tier - 1] + delta);
- __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
return true;
}
@@ -5292,7 +5330,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
static unsigned long get_nr_to_reclaim(struct scan_control *sc)
{
/* don't abort memcg reclaim to ensure fairness */
- if (!global_reclaim(sc))
+ if (!root_reclaim(sc))
return -1;
return max(sc->nr_to_reclaim, compact_gap(sc->order));
@@ -5444,7 +5482,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
{
struct blk_plug plug;
- VM_WARN_ON_ONCE(global_reclaim(sc));
+ VM_WARN_ON_ONCE(root_reclaim(sc));
VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
lru_add_drain();
@@ -5505,7 +5543,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
struct blk_plug plug;
unsigned long reclaimed = sc->nr_reclaimed;
- VM_WARN_ON_ONCE(!global_reclaim(sc));
+ VM_WARN_ON_ONCE(!root_reclaim(sc));
/*
* Unmapped clean folios are already prioritized. Scanning for more of
@@ -5712,10 +5750,10 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c
if (get_cap(LRU_GEN_CORE))
caps |= BIT(LRU_GEN_CORE);
- if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+ if (should_walk_mmu())
caps |= BIT(LRU_GEN_MM_WALK);
- if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ if (should_clear_pmd_young())
caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
return sysfs_emit(buf, "0x%04x\n", caps);
@@ -6227,7 +6265,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
bool proportional_reclaim;
struct blk_plug plug;
- if (lru_gen_enabled() && !global_reclaim(sc)) {
+ if (lru_gen_enabled() && !root_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -6383,14 +6421,13 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
if (!managed_zone(zone))
continue;
- switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
- case COMPACT_SUCCESS:
- case COMPACT_CONTINUE:
+ /* Allocation can already succeed, nothing to do */
+ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ sc->reclaim_idx, 0))
+ return false;
+
+ if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
return false;
- default:
- /* check next zone */
- ;
- }
}
/*
@@ -6469,7 +6506,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
struct lruvec *target_lruvec;
bool reclaimable = false;
- if (lru_gen_enabled() && global_reclaim(sc)) {
+ if (lru_gen_enabled() && root_reclaim(sc)) {
lru_gen_shrink_node(pgdat, sc);
return;
}
@@ -6541,10 +6578,13 @@ again:
* Legacy memcg will stall in page writeback so avoid forcibly
* stalling in reclaim_throttle().
*/
- if ((current_is_kswapd() ||
- (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
- sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
- set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
+ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) {
+ if (cgroup_reclaim(sc) && writeback_throttling_sane(sc))
+ set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags);
+
+ if (current_is_kswapd())
+ set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags);
+ }
/*
* Stall direct reclaim for IO completions if the lruvec is
@@ -6554,7 +6594,8 @@ again:
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
- test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
+ (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) ||
+ test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)))
reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc))
@@ -6578,14 +6619,14 @@ again:
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
unsigned long watermark;
- enum compact_result suitable;
- suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
- if (suitable == COMPACT_SUCCESS)
- /* Allocation should succeed already. Don't reclaim. */
+ /* Allocation can already succeed, nothing to do */
+ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ sc->reclaim_idx, 0))
return true;
- if (suitable == COMPACT_SKIPPED)
- /* Compaction cannot yet proceed. Do reclaim. */
+
+ /* Compaction cannot yet proceed. Do reclaim. */
+ if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
return false;
/*
@@ -6811,7 +6852,7 @@ retry:
lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
zone->zone_pgdat);
- clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
+ clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
}
}
@@ -6872,7 +6913,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
- free_pages += zone_page_state(zone, NR_FREE_PAGES);
+ free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES);
}
/* If there are no reserves (unexpected config) then do not throttle */
@@ -7200,7 +7241,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
{
struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
- clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
+ clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags);
+ clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
clear_bit(PGDAT_DIRTY, &pgdat->flags);
clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
@@ -7825,7 +7867,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
/*
* This kswapd start function will be called by init and node-hot-add.
*/
-void kswapd_run(int nid)
+void __meminit kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
@@ -7846,7 +7888,7 @@ void kswapd_run(int nid)
* Called by memory hotplug when all memory in a node is offlined. Caller must
* be holding mem_hotplug_begin/done().
*/
-void kswapd_stop(int nid)
+void __meminit kswapd_stop(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
struct task_struct *kswapd;
@@ -8043,23 +8085,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
}
#endif
-void check_move_unevictable_pages(struct pagevec *pvec)
-{
- struct folio_batch fbatch;
- unsigned i;
-
- folio_batch_init(&fbatch);
- for (i = 0; i < pvec->nr; i++) {
- struct page *page = pvec->pages[i];
-
- if (PageTransTail(page))
- continue;
- folio_batch_add(&fbatch, page_folio(page));
- }
- check_move_unevictable_folios(&fbatch);
-}
-EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
-
/**
* check_move_unevictable_folios - Move evictable folios to appropriate zone
* lru list
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 282349cabf01..b731d57996c5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/sched/isolation.h>
#include "internal.h"
@@ -2025,6 +2026,20 @@ static void vmstat_shepherd(struct work_struct *w)
for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+ /*
+ * In kernel users of vmstat counters either require the precise value and
+ * they are using zone_page_state_snapshot interface or they can live with
+ * an imprecision as the regular flushing can happen at arbitrary time and
+ * cumulative error can grow (see calculate_normal_threshold).
+ *
+ * From that POV the regular flushing can be postponed for CPUs that have
+ * been isolated from the kernel interference without critical
+ * infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
+ * for all isolated CPUs to avoid interference with the isolated workload.
+ */
+ if (cpu_is_isolated(cpu))
+ continue;
+
if (!delayed_work_pending(dw) && need_update(cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
diff --git a/mm/workingset.c b/mm/workingset.c
index 817758951886..4686ae363000 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -255,45 +255,58 @@ static void *lru_gen_eviction(struct folio *folio)
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
}
+/*
+ * Tests if the shadow entry is for a folio that was recently evicted.
+ * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
+ */
+static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+ unsigned long *token, bool *workingset)
+{
+ int memcg_id;
+ unsigned long min_seq;
+ struct mem_cgroup *memcg;
+ struct pglist_data *pgdat;
+
+ unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+
+ memcg = mem_cgroup_from_id(memcg_id);
+ *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
+ return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
+}
+
static void lru_gen_refault(struct folio *folio, void *shadow)
{
+ bool recent;
int hist, tier, refs;
- int memcg_id;
bool workingset;
unsigned long token;
- unsigned long min_seq;
struct lruvec *lruvec;
struct lru_gen_folio *lrugen;
- struct mem_cgroup *memcg;
- struct pglist_data *pgdat;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
- unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
-
- if (pgdat != folio_pgdat(folio))
- return;
-
rcu_read_lock();
- memcg = folio_memcg_rcu(folio);
- if (memcg_id != mem_cgroup_id(memcg))
+ recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
+ if (lruvec != folio_lruvec(folio))
goto unlock;
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- lrugen = &lruvec->lrugen;
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
- min_seq = READ_ONCE(lrugen->min_seq[type]);
- if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
+ if (!recent)
goto unlock;
- hist = lru_hist_from_seq(min_seq);
+ lrugen = &lruvec->lrugen;
+
+ hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
/* see the comment in folio_lru_refs() */
refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
tier = lru_tier_from_refs(refs);
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
- mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
/*
* Count the following two cases as stalls:
@@ -317,6 +330,12 @@ static void *lru_gen_eviction(struct folio *folio)
return NULL;
}
+static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+ unsigned long *token, bool *workingset)
+{
+ return false;
+}
+
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}
@@ -385,42 +404,33 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
}
/**
- * workingset_refault - Evaluate the refault of a previously evicted folio.
- * @folio: The freshly allocated replacement folio.
- * @shadow: Shadow entry of the evicted folio.
- *
- * Calculates and evaluates the refault distance of the previously
- * evicted folio in the context of the node and the memcg whose memory
- * pressure caused the eviction.
+ * workingset_test_recent - tests if the shadow entry is for a folio that was
+ * recently evicted. Also fills in @workingset with the value unpacked from
+ * shadow.
+ * @shadow: the shadow entry to be tested.
+ * @file: whether the corresponding folio is from the file lru.
+ * @workingset: where the workingset value unpacked from shadow should
+ * be stored.
+ *
+ * Return: true if the shadow is for a recently evicted folio; false otherwise.
*/
-void workingset_refault(struct folio *folio, void *shadow)
+bool workingset_test_recent(void *shadow, bool file, bool *workingset)
{
- bool file = folio_is_file_lru(folio);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
unsigned long workingset_size;
- struct pglist_data *pgdat;
- struct mem_cgroup *memcg;
- unsigned long eviction;
- struct lruvec *lruvec;
unsigned long refault;
- bool workingset;
int memcgid;
- long nr;
+ struct pglist_data *pgdat;
+ unsigned long eviction;
- if (lru_gen_enabled()) {
- lru_gen_refault(folio, shadow);
- return;
- }
+ if (lru_gen_enabled())
+ return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset);
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
eviction <<= bucket_order;
- /* Flush stats (and potentially sleep) before holding RCU read lock */
- mem_cgroup_flush_stats_ratelimited();
-
- rcu_read_lock();
/*
* Look up the memcg associated with the stored ID. It might
* have been deleted since the folio's eviction.
@@ -439,7 +449,8 @@ void workingset_refault(struct folio *folio, void *shadow)
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() && !eviction_memcg)
- goto out;
+ return false;
+
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -462,20 +473,6 @@ void workingset_refault(struct folio *folio, void *shadow)
refault_distance = (refault - eviction) & EVICTION_MASK;
/*
- * The activation decision for this folio is made at the level
- * where the eviction occurred, as that is where the LRU order
- * during folio reclaim is being determined.
- *
- * However, the cgroup that will own the folio is the one that
- * is actually experiencing the refault event.
- */
- nr = folio_nr_pages(folio);
- memcg = folio_memcg(folio);
- pgdat = folio_pgdat(folio);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
-
- mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
- /*
* Compare the distance to the existing workingset size. We
* don't activate pages that couldn't stay resident even if
* all the memory was available to the workingset. Whether
@@ -495,7 +492,54 @@ void workingset_refault(struct folio *folio, void *shadow)
NR_INACTIVE_ANON);
}
}
- if (refault_distance > workingset_size)
+
+ return refault_distance <= workingset_size;
+}
+
+/**
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted folio in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
+ */
+void workingset_refault(struct folio *folio, void *shadow)
+{
+ bool file = folio_is_file_lru(folio);
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ bool workingset;
+ long nr;
+
+ if (lru_gen_enabled()) {
+ lru_gen_refault(folio, shadow);
+ return;
+ }
+
+ /* Flush stats (and potentially sleep) before holding RCU read lock */
+ mem_cgroup_flush_stats_ratelimited();
+
+ rcu_read_lock();
+
+ /*
+ * The activation decision for this folio is made at the level
+ * where the eviction occurred, as that is where the LRU order
+ * during folio reclaim is being determined.
+ *
+ * However, the cgroup that will own the folio is the one that
+ * is actually experiencing the refault event.
+ */
+ nr = folio_nr_pages(folio);
+ memcg = folio_memcg(folio);
+ pgdat = folio_pgdat(folio);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
+
+ if (!workingset_test_recent(shadow, file, &workingset))
goto out;
folio_set_active(folio);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 0cef845d397b..e84de91ecccb 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -125,13 +125,11 @@ struct z3fold_header {
/**
* struct z3fold_pool - stores metadata for each z3fold pool
* @name: pool name
- * @lock: protects pool unbuddied/lru lists
+ * @lock: protects pool unbuddied lists
* @stale_lock: protects pool stale page list
* @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
* buddies; the list each z3fold page is added to depends on
* the size of its free region.
- * @lru: list tracking the z3fold pages in LRU order by most recently
- * added buddy.
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
* @c_handle: cache for z3fold_buddy_slots allocation
@@ -149,12 +147,9 @@ struct z3fold_pool {
spinlock_t lock;
spinlock_t stale_lock;
struct list_head *unbuddied;
- struct list_head lru;
struct list_head stale;
atomic64_t pages_nr;
struct kmem_cache *c_handle;
- struct zpool *zpool;
- const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
struct work_struct work;
@@ -329,7 +324,6 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
struct z3fold_header *zhdr = page_address(page);
struct z3fold_buddy_slots *slots;
- INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
@@ -451,8 +445,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
set_bit(PAGE_STALE, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
spin_lock(&pool->lock);
- if (!list_empty(&page->lru))
- list_del_init(&page->lru);
spin_unlock(&pool->lock);
if (locked)
@@ -930,7 +922,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp)
for_each_unbuddied_list(i, 0)
INIT_LIST_HEAD(&unbuddied[i]);
}
- INIT_LIST_HEAD(&pool->lru);
INIT_LIST_HEAD(&pool->stale);
atomic64_set(&pool->pages_nr, 0);
pool->name = name;
@@ -1073,12 +1064,6 @@ found:
headless:
spin_lock(&pool->lock);
- /* Add/move z3fold page to beginning of LRU */
- if (!list_empty(&page->lru))
- list_del(&page->lru);
-
- list_add(&page->lru, &pool->lru);
-
*handle = encode_handle(zhdr, bud);
spin_unlock(&pool->lock);
if (bud != HEADLESS)
@@ -1115,9 +1100,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
* immediately so we don't care about its value any more.
*/
if (!page_claimed) {
- spin_lock(&pool->lock);
- list_del(&page->lru);
- spin_unlock(&pool->lock);
put_z3fold_header(zhdr);
free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
@@ -1173,194 +1155,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
}
/**
- * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
- * @pool: pool from which a page will attempt to be evicted
- * @retries: number of pages on the LRU list for which eviction will
- * be attempted before failing
- *
- * z3fold reclaim is different from normal system reclaim in that it is done
- * from the bottom, up. This is because only the bottom layer, z3fold, has
- * information on how the allocations are organized within each z3fold page.
- * This has the potential to create interesting locking situations between
- * z3fold and the user, however.
- *
- * To avoid these, this is how z3fold_reclaim_page() should be called:
- *
- * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
- * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
- * call the user-defined eviction handler with the pool and handle as
- * arguments.
- *
- * If the handle can not be evicted, the eviction handler should return
- * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
- * appropriate list and try the next z3fold page on the LRU up to
- * a user defined number of retries.
- *
- * If the handle is successfully evicted, the eviction handler should
- * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
- * contains logic to delay freeing the page if the page is under reclaim,
- * as indicated by the setting of the PG_reclaim flag on the underlying page.
- *
- * If all buddies in the z3fold page are successfully evicted, then the
- * z3fold page can be freed.
- *
- * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
- * no pages to evict or an eviction handler is not registered, -EAGAIN if
- * the retry limit was hit.
- */
-static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
-{
- int i, ret = -1;
- struct z3fold_header *zhdr = NULL;
- struct page *page = NULL;
- struct list_head *pos;
- unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
- struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
-
- rwlock_init(&slots.lock);
- slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
-
- spin_lock(&pool->lock);
- for (i = 0; i < retries; i++) {
- if (list_empty(&pool->lru)) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
- list_for_each_prev(pos, &pool->lru) {
- page = list_entry(pos, struct page, lru);
-
- zhdr = page_address(page);
- if (test_bit(PAGE_HEADLESS, &page->private)) {
- /*
- * For non-headless pages, we wait to do this
- * until we have the page lock to avoid racing
- * with __z3fold_alloc(). Headless pages don't
- * have a lock (and __z3fold_alloc() will never
- * see them), but we still need to test and set
- * PAGE_CLAIMED to avoid racing with
- * z3fold_free(), so just do it now before
- * leaving the loop.
- */
- if (test_and_set_bit(PAGE_CLAIMED, &page->private))
- continue;
-
- break;
- }
-
- if (!z3fold_page_trylock(zhdr)) {
- zhdr = NULL;
- continue; /* can't evict at this point */
- }
-
- /* test_and_set_bit is of course atomic, but we still
- * need to do it under page lock, otherwise checking
- * that bit in __z3fold_alloc wouldn't make sense
- */
- if (zhdr->foreign_handles ||
- test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- continue; /* can't evict such page */
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- /* See comment in __z3fold_alloc. */
- kref_get(&zhdr->refcount);
- break;
- }
-
- if (!zhdr)
- break;
-
- list_del_init(&page->lru);
- spin_unlock(&pool->lock);
-
- if (!test_bit(PAGE_HEADLESS, &page->private)) {
- /*
- * We need encode the handles before unlocking, and
- * use our local slots structure because z3fold_free
- * can zero out zhdr->slots and we can't do much
- * about that
- */
- first_handle = 0;
- last_handle = 0;
- middle_handle = 0;
- memset(slots.slot, 0, sizeof(slots.slot));
- if (zhdr->first_chunks)
- first_handle = __encode_handle(zhdr, &slots,
- FIRST);
- if (zhdr->middle_chunks)
- middle_handle = __encode_handle(zhdr, &slots,
- MIDDLE);
- if (zhdr->last_chunks)
- last_handle = __encode_handle(zhdr, &slots,
- LAST);
- /*
- * it's safe to unlock here because we hold a
- * reference to this page
- */
- z3fold_page_unlock(zhdr);
- } else {
- first_handle = encode_handle(zhdr, HEADLESS);
- last_handle = middle_handle = 0;
- }
- /* Issue the eviction callback(s) */
- if (middle_handle) {
- ret = pool->zpool_ops->evict(pool->zpool, middle_handle);
- if (ret)
- goto next;
- }
- if (first_handle) {
- ret = pool->zpool_ops->evict(pool->zpool, first_handle);
- if (ret)
- goto next;
- }
- if (last_handle) {
- ret = pool->zpool_ops->evict(pool->zpool, last_handle);
- if (ret)
- goto next;
- }
-next:
- if (test_bit(PAGE_HEADLESS, &page->private)) {
- if (ret == 0) {
- free_z3fold_page(page, true);
- atomic64_dec(&pool->pages_nr);
- return 0;
- }
- spin_lock(&pool->lock);
- list_add(&page->lru, &pool->lru);
- spin_unlock(&pool->lock);
- clear_bit(PAGE_CLAIMED, &page->private);
- } else {
- struct z3fold_buddy_slots *slots = zhdr->slots;
- z3fold_page_lock(zhdr);
- if (kref_put(&zhdr->refcount,
- release_z3fold_page_locked)) {
- kmem_cache_free(pool->c_handle, slots);
- return 0;
- }
- /*
- * if we are here, the page is still not completely
- * free. Take the global pool lock then to be able
- * to add it back to the lru list
- */
- spin_lock(&pool->lock);
- list_add(&page->lru, &pool->lru);
- spin_unlock(&pool->lock);
- if (list_empty(&zhdr->buddy))
- add_to_unbuddied(pool, zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
- }
-
- /* We started off locked to we need to lock the pool back */
- spin_lock(&pool->lock);
- }
- spin_unlock(&pool->lock);
- return -EAGAIN;
-}
-
-/**
* z3fold_map() - maps the allocation associated with the given handle
* @pool: pool in which the allocation resides
* @handle: handle associated with the allocation to be mapped
@@ -1470,8 +1264,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
spin_lock(&pool->lock);
if (!list_empty(&zhdr->buddy))
list_del_init(&zhdr->buddy);
- if (!list_empty(&page->lru))
- list_del_init(&page->lru);
spin_unlock(&pool->lock);
kref_get(&zhdr->refcount);
@@ -1531,9 +1323,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page,
encode_handle(new_zhdr, MIDDLE);
set_bit(NEEDS_COMPACTING, &newpage->private);
new_zhdr->cpu = smp_processor_id();
- spin_lock(&pool->lock);
- list_add(&newpage->lru, &pool->lru);
- spin_unlock(&pool->lock);
__SetPageMovable(newpage, &z3fold_mops);
z3fold_page_unlock(new_zhdr);
@@ -1559,9 +1348,6 @@ static void z3fold_page_putback(struct page *page)
INIT_LIST_HEAD(&page->lru);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
return;
- spin_lock(&pool->lock);
- list_add(&page->lru, &pool->lru);
- spin_unlock(&pool->lock);
if (list_empty(&zhdr->buddy))
add_to_unbuddied(pool, zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
@@ -1578,18 +1364,9 @@ static const struct movable_operations z3fold_mops = {
* zpool
****************/
-static void *z3fold_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
+static void *z3fold_zpool_create(const char *name, gfp_t gfp)
{
- struct z3fold_pool *pool;
-
- pool = z3fold_create_pool(name, gfp);
- if (pool) {
- pool->zpool = zpool;
- pool->zpool_ops = zpool_ops;
- }
- return pool;
+ return z3fold_create_pool(name, gfp);
}
static void z3fold_zpool_destroy(void *pool)
@@ -1607,25 +1384,6 @@ static void z3fold_zpool_free(void *pool, unsigned long handle)
z3fold_free(pool, handle);
}
-static int z3fold_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- unsigned int total = 0;
- int ret = -EINVAL;
-
- while (total < pages) {
- ret = z3fold_reclaim_page(pool, 8);
- if (ret < 0)
- break;
- total++;
- }
-
- if (reclaimed)
- *reclaimed = total;
-
- return ret;
-}
-
static void *z3fold_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -1649,7 +1407,6 @@ static struct zpool_driver z3fold_zpool_driver = {
.destroy = z3fold_zpool_destroy,
.malloc = z3fold_zpool_malloc,
.free = z3fold_zpool_free,
- .shrink = z3fold_zpool_shrink,
.map = z3fold_zpool_map,
.unmap = z3fold_zpool_unmap,
.total_size = z3fold_zpool_total_size,
diff --git a/mm/zbud.c b/mm/zbud.c
index 3acd26193920..2190cc1f37b3 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -83,11 +83,7 @@ struct zbud_pool;
* its free region.
* @buddied: list tracking the zbud pages that contain two buddies;
* these zbud pages are full
- * @lru: list tracking the zbud pages in LRU order by most recently
- * added buddy.
* @pages_nr: number of zbud pages in the pool.
- * @zpool: zpool driver
- * @zpool_ops: zpool operations structure with an evict callback
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular zbud pool.
@@ -102,26 +98,20 @@ struct zbud_pool {
struct list_head buddied;
struct list_head unbuddied[NCHUNKS];
};
- struct list_head lru;
u64 pages_nr;
- struct zpool *zpool;
- const struct zpool_ops *zpool_ops;
};
/*
* struct zbud_header - zbud page metadata occupying the first chunk of each
* zbud page.
* @buddy: links the zbud page into the unbuddied/buddied lists in the pool
- * @lru: links the zbud page into the lru list in the pool
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @last_chunks: the size of the last buddy in chunks, 0 if free
*/
struct zbud_header {
struct list_head buddy;
- struct list_head lru;
unsigned int first_chunks;
unsigned int last_chunks;
- bool under_reclaim;
};
/*****************
@@ -149,8 +139,6 @@ static struct zbud_header *init_zbud_page(struct page *page)
zhdr->first_chunks = 0;
zhdr->last_chunks = 0;
INIT_LIST_HEAD(&zhdr->buddy);
- INIT_LIST_HEAD(&zhdr->lru);
- zhdr->under_reclaim = false;
return zhdr;
}
@@ -221,7 +209,6 @@ static struct zbud_pool *zbud_create_pool(gfp_t gfp)
for_each_unbuddied_list(i, 0)
INIT_LIST_HEAD(&pool->unbuddied[i]);
INIT_LIST_HEAD(&pool->buddied);
- INIT_LIST_HEAD(&pool->lru);
pool->pages_nr = 0;
return pool;
}
@@ -310,11 +297,6 @@ found:
list_add(&zhdr->buddy, &pool->buddied);
}
- /* Add/move zbud page to beginning of LRU */
- if (!list_empty(&zhdr->lru))
- list_del(&zhdr->lru);
- list_add(&zhdr->lru, &pool->lru);
-
*handle = encode_handle(zhdr, bud);
spin_unlock(&pool->lock);
@@ -325,11 +307,6 @@ found:
* zbud_free() - frees the allocation associated with the given handle
* @pool: pool in which the allocation resided
* @handle: handle associated with the allocation returned by zbud_alloc()
- *
- * In the case that the zbud page in which the allocation resides is under
- * reclaim, as indicated by the PG_reclaim flag being set, this function
- * only sets the first|last_chunks to 0. The page is actually freed
- * once both buddies are evicted (see zbud_reclaim_page() below).
*/
static void zbud_free(struct zbud_pool *pool, unsigned long handle)
{
@@ -345,18 +322,11 @@ static void zbud_free(struct zbud_pool *pool, unsigned long handle)
else
zhdr->first_chunks = 0;
- if (zhdr->under_reclaim) {
- /* zbud page is under reclaim, reclaim will free */
- spin_unlock(&pool->lock);
- return;
- }
-
/* Remove from existing buddy list */
list_del(&zhdr->buddy);
if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
/* zbud page is empty, free */
- list_del(&zhdr->lru);
free_zbud_page(zhdr);
pool->pages_nr--;
} else {
@@ -369,110 +339,6 @@ static void zbud_free(struct zbud_pool *pool, unsigned long handle)
}
/**
- * zbud_reclaim_page() - evicts allocations from a pool page and frees it
- * @pool: pool from which a page will attempt to be evicted
- * @retries: number of pages on the LRU list for which eviction will
- * be attempted before failing
- *
- * zbud reclaim is different from normal system reclaim in that the reclaim is
- * done from the bottom, up. This is because only the bottom layer, zbud, has
- * information on how the allocations are organized within each zbud page. This
- * has the potential to create interesting locking situations between zbud and
- * the user, however.
- *
- * To avoid these, this is how zbud_reclaim_page() should be called:
- *
- * The user detects a page should be reclaimed and calls zbud_reclaim_page().
- * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
- * the user-defined eviction handler with the pool and handle as arguments.
- *
- * If the handle can not be evicted, the eviction handler should return
- * non-zero. zbud_reclaim_page() will add the zbud page back to the
- * appropriate list and try the next zbud page on the LRU up to
- * a user defined number of retries.
- *
- * If the handle is successfully evicted, the eviction handler should
- * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
- * contains logic to delay freeing the page if the page is under reclaim,
- * as indicated by the setting of the PG_reclaim flag on the underlying page.
- *
- * If all buddies in the zbud page are successfully evicted, then the
- * zbud page can be freed.
- *
- * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
- * no pages to evict or an eviction handler is not registered, -EAGAIN if
- * the retry limit was hit.
- */
-static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
-{
- int i, ret, freechunks;
- struct zbud_header *zhdr;
- unsigned long first_handle = 0, last_handle = 0;
-
- spin_lock(&pool->lock);
- if (list_empty(&pool->lru)) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
- for (i = 0; i < retries; i++) {
- zhdr = list_last_entry(&pool->lru, struct zbud_header, lru);
- list_del(&zhdr->lru);
- list_del(&zhdr->buddy);
- /* Protect zbud page against free */
- zhdr->under_reclaim = true;
- /*
- * We need encode the handles before unlocking, since we can
- * race with free that will set (first|last)_chunks to 0
- */
- first_handle = 0;
- last_handle = 0;
- if (zhdr->first_chunks)
- first_handle = encode_handle(zhdr, FIRST);
- if (zhdr->last_chunks)
- last_handle = encode_handle(zhdr, LAST);
- spin_unlock(&pool->lock);
-
- /* Issue the eviction callback(s) */
- if (first_handle) {
- ret = pool->zpool_ops->evict(pool->zpool, first_handle);
- if (ret)
- goto next;
- }
- if (last_handle) {
- ret = pool->zpool_ops->evict(pool->zpool, last_handle);
- if (ret)
- goto next;
- }
-next:
- spin_lock(&pool->lock);
- zhdr->under_reclaim = false;
- if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- /*
- * Both buddies are now free, free the zbud page and
- * return success.
- */
- free_zbud_page(zhdr);
- pool->pages_nr--;
- spin_unlock(&pool->lock);
- return 0;
- } else if (zhdr->first_chunks == 0 ||
- zhdr->last_chunks == 0) {
- /* add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- } else {
- /* add to buddied list */
- list_add(&zhdr->buddy, &pool->buddied);
- }
-
- /* add to beginning of LRU */
- list_add(&zhdr->lru, &pool->lru);
- }
- spin_unlock(&pool->lock);
- return -EAGAIN;
-}
-
-/**
* zbud_map() - maps the allocation associated with the given handle
* @pool: pool in which the allocation resides
* @handle: handle associated with the allocation to be mapped
@@ -514,18 +380,9 @@ static u64 zbud_get_pool_size(struct zbud_pool *pool)
* zpool
****************/
-static void *zbud_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
+static void *zbud_zpool_create(const char *name, gfp_t gfp)
{
- struct zbud_pool *pool;
-
- pool = zbud_create_pool(gfp);
- if (pool) {
- pool->zpool = zpool;
- pool->zpool_ops = zpool_ops;
- }
- return pool;
+ return zbud_create_pool(gfp);
}
static void zbud_zpool_destroy(void *pool)
@@ -543,25 +400,6 @@ static void zbud_zpool_free(void *pool, unsigned long handle)
zbud_free(pool, handle);
}
-static int zbud_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- unsigned int total = 0;
- int ret = -EINVAL;
-
- while (total < pages) {
- ret = zbud_reclaim_page(pool, 8);
- if (ret < 0)
- break;
- total++;
- }
-
- if (reclaimed)
- *reclaimed = total;
-
- return ret;
-}
-
static void *zbud_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -585,7 +423,6 @@ static struct zpool_driver zbud_zpool_driver = {
.destroy = zbud_zpool_destroy,
.malloc = zbud_zpool_malloc,
.free = zbud_zpool_free,
- .shrink = zbud_zpool_shrink,
.map = zbud_zpool_map,
.unmap = zbud_zpool_unmap,
.total_size = zbud_zpool_total_size,
diff --git a/mm/zpool.c b/mm/zpool.c
index 6a19c4a58f77..846410479c2f 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -133,7 +133,6 @@ EXPORT_SYMBOL(zpool_has_pool);
* @type: The type of the zpool to create (e.g. zbud, zsmalloc)
* @name: The name of the zpool (e.g. zram0, zswap)
* @gfp: The GFP flags to use when allocating the pool.
- * @ops: The optional ops callback.
*
* This creates a new zpool of the specified type. The gfp flags will be
* used when allocating memory, if the implementation supports it. If the
@@ -145,8 +144,7 @@ EXPORT_SYMBOL(zpool_has_pool);
*
* Returns: New zpool on success, NULL on failure.
*/
-struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
- const struct zpool_ops *ops)
+struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp)
{
struct zpool_driver *driver;
struct zpool *zpool;
@@ -173,7 +171,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
}
zpool->driver = driver;
- zpool->pool = driver->create(name, gfp, ops, zpool);
+ zpool->pool = driver->create(name, gfp);
if (!zpool->pool) {
pr_err("couldn't create %s pool\n", type);
@@ -280,30 +278,6 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
}
/**
- * zpool_shrink() - Shrink the pool size
- * @zpool: The zpool to shrink.
- * @pages: The number of pages to shrink the pool.
- * @reclaimed: The number of pages successfully evicted.
- *
- * This attempts to shrink the actual memory size of the pool
- * by evicting currently used handle(s). If the pool was
- * created with no zpool_ops, or the evict call fails for any
- * of the handles, this will fail. If non-NULL, the @reclaimed
- * parameter will be set to the number of pages reclaimed,
- * which may be more than the number of pages requested.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * Returns: 0 on success, negative value on error/failure.
- */
-int zpool_shrink(struct zpool *zpool, unsigned int pages,
- unsigned int *reclaimed)
-{
- return zpool->driver->shrink ?
- zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
-}
-
-/**
* zpool_map_handle() - Map a previously allocated handle into memory
* @zpool: The zpool that the handle was allocated from
* @handle: The handle to map
@@ -360,24 +334,6 @@ u64 zpool_get_total_size(struct zpool *zpool)
}
/**
- * zpool_evictable() - Test if zpool is potentially evictable
- * @zpool: The zpool to test
- *
- * Zpool is only potentially evictable when it's created with struct
- * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
- *
- * However, it doesn't necessarily mean driver will use zpool_ops.evict
- * in its implementation of zpool_driver.shrink. It could do internal
- * defragmentation instead.
- *
- * Returns: true if potentially evictable; false otherwise.
- */
-bool zpool_evictable(struct zpool *zpool)
-{
- return zpool->driver->shrink;
-}
-
-/**
* zpool_can_sleep_mapped - Test if zpool can sleep when do mapped.
* @zpool: The zpool to test
*
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 02f7f414aade..3f057970504e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -107,21 +107,8 @@
*/
#define OBJ_ALLOCATED_TAG 1
-#ifdef CONFIG_ZPOOL
-/*
- * The second least-significant bit in the object's header identifies if the
- * value stored at the header is a deferred handle from the last reclaim
- * attempt.
- *
- * As noted above, this is valid because we have room for two bits.
- */
-#define OBJ_DEFERRED_HANDLE_TAG 2
-#define OBJ_TAG_BITS 2
-#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
-#else
#define OBJ_TAG_BITS 1
#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
-#endif /* CONFIG_ZPOOL */
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
@@ -227,12 +214,6 @@ struct link_free {
* Handle of allocated object.
*/
unsigned long handle;
-#ifdef CONFIG_ZPOOL
- /*
- * Deferred handle of a reclaimed object.
- */
- unsigned long deferred_handle;
-#endif
};
};
@@ -250,13 +231,6 @@ struct zs_pool {
/* Compact classes */
struct shrinker shrinker;
-#ifdef CONFIG_ZPOOL
- /* List tracking the zspages in LRU order by most recently added object */
- struct list_head lru;
- struct zpool *zpool;
- const struct zpool_ops *zpool_ops;
-#endif
-
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
@@ -279,13 +253,6 @@ struct zspage {
unsigned int freeobj;
struct page *first_page;
struct list_head list; /* fullness list */
-
-#ifdef CONFIG_ZPOOL
- /* links the zspage to the lru list in the pool */
- struct list_head lru;
- bool under_reclaim;
-#endif
-
struct zs_pool *pool;
rwlock_t lock;
};
@@ -384,23 +351,14 @@ static void record_obj(unsigned long handle, unsigned long obj)
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
+static void *zs_zpool_create(const char *name, gfp_t gfp)
{
/*
* Ignore global gfp flags: zs_malloc() may be invoked from
* different contexts and its caller must provide a valid
* gfp mask.
*/
- struct zs_pool *pool = zs_create_pool(name);
-
- if (pool) {
- pool->zpool = zpool;
- pool->zpool_ops = zpool_ops;
- }
-
- return pool;
+ return zs_create_pool(name);
}
static void zs_zpool_destroy(void *pool)
@@ -422,27 +380,6 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
-static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries);
-
-static int zs_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- unsigned int total = 0;
- int ret = -EINVAL;
-
- while (total < pages) {
- ret = zs_reclaim_page(pool, 8);
- if (ret < 0)
- break;
- total++;
- }
-
- if (reclaimed)
- *reclaimed = total;
-
- return ret;
-}
-
static void *zs_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -481,7 +418,6 @@ static struct zpool_driver zs_zpool_driver = {
.malloc_support_movable = true,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
- .shrink = zs_zpool_shrink,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
.total_size = zs_zpool_total_size,
@@ -884,14 +820,6 @@ static inline bool obj_allocated(struct page *page, void *obj, unsigned long *ph
return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
}
-#ifdef CONFIG_ZPOOL
-static bool obj_stores_deferred_handle(struct page *page, void *obj,
- unsigned long *phandle)
-{
- return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
-}
-#endif
-
static void reset_page(struct page *page)
{
__ClearPageMovable(page);
@@ -922,39 +850,6 @@ unlock:
return 0;
}
-#ifdef CONFIG_ZPOOL
-static unsigned long find_deferred_handle_obj(struct size_class *class,
- struct page *page, int *obj_idx);
-
-/*
- * Free all the deferred handles whose objects are freed in zs_free.
- */
-static void free_handles(struct zs_pool *pool, struct size_class *class,
- struct zspage *zspage)
-{
- int obj_idx = 0;
- struct page *page = get_first_page(zspage);
- unsigned long handle;
-
- while (1) {
- handle = find_deferred_handle_obj(class, page, &obj_idx);
- if (!handle) {
- page = get_next_page(page);
- if (!page)
- break;
- obj_idx = 0;
- continue;
- }
-
- cache_free_handle(pool, handle);
- obj_idx++;
- }
-}
-#else
-static inline void free_handles(struct zs_pool *pool, struct size_class *class,
- struct zspage *zspage) {}
-#endif
-
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
@@ -969,9 +864,6 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(fg != ZS_INUSE_RATIO_0);
- /* Free all deferred handles from zs_free */
- free_handles(pool, class, zspage);
-
next = page = get_first_page(zspage);
do {
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1006,9 +898,6 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
}
remove_zspage(class, zspage, ZS_INUSE_RATIO_0);
-#ifdef CONFIG_ZPOOL
- list_del(&zspage->lru);
-#endif
__free_zspage(pool, class, zspage);
}
@@ -1054,11 +943,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
off %= PAGE_SIZE;
}
-#ifdef CONFIG_ZPOOL
- INIT_LIST_HEAD(&zspage->lru);
- zspage->under_reclaim = false;
-#endif
-
set_freeobj(zspage, 0);
}
@@ -1341,7 +1225,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
spin_unlock(&pool->lock);
class = zspage_class(pool, zspage);
- off = (class->size * obj_idx) & ~PAGE_MASK;
+ off = offset_in_page(class->size * obj_idx);
local_lock(&zs_map_area.lock);
area = this_cpu_ptr(&zs_map_area);
@@ -1381,7 +1265,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
class = zspage_class(pool, zspage);
- off = (class->size * obj_idx) & ~PAGE_MASK;
+ off = offset_in_page(class->size * obj_idx);
area = this_cpu_ptr(&zs_map_area);
if (off + class->size <= PAGE_SIZE)
@@ -1438,7 +1322,7 @@ static unsigned long obj_malloc(struct zs_pool *pool,
offset = obj * class->size;
nr_page = offset >> PAGE_SHIFT;
- m_offset = offset & ~PAGE_MASK;
+ m_offset = offset_in_page(offset);
m_page = get_first_page(zspage);
for (i = 0; i < nr_page; i++)
@@ -1525,20 +1409,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
out:
-#ifdef CONFIG_ZPOOL
- /* Add/move zspage to beginning of LRU */
- if (!list_empty(&zspage->lru))
- list_del(&zspage->lru);
- list_add(&zspage->lru, &pool->lru);
-#endif
-
spin_unlock(&pool->lock);
return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);
-static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
+static void obj_free(int class_size, unsigned long obj)
{
struct link_free *link;
struct zspage *zspage;
@@ -1548,31 +1425,18 @@ static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
void *vaddr;
obj_to_location(obj, &f_page, &f_objidx);
- f_offset = (class_size * f_objidx) & ~PAGE_MASK;
+ f_offset = offset_in_page(class_size * f_objidx);
zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
link = (struct link_free *)(vaddr + f_offset);
- if (handle) {
-#ifdef CONFIG_ZPOOL
- /* Stores the (deferred) handle in the object's header */
- *handle |= OBJ_DEFERRED_HANDLE_TAG;
- *handle &= ~OBJ_ALLOCATED_TAG;
-
- if (likely(!ZsHugePage(zspage)))
- link->deferred_handle = *handle;
- else
- f_page->index = *handle;
-#endif
- } else {
- /* Insert this object in containing zspage's freelist */
- if (likely(!ZsHugePage(zspage)))
- link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
- else
- f_page->index = 0;
- set_freeobj(zspage, f_objidx);
- }
+ /* Insert this object in containing zspage's freelist */
+ if (likely(!ZsHugePage(zspage)))
+ link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
+ else
+ f_page->index = 0;
+ set_freeobj(zspage, f_objidx);
kunmap_atomic(vaddr);
mod_zspage_inuse(zspage, -1);
@@ -1600,21 +1464,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
class = zspage_class(pool, zspage);
class_stat_dec(class, ZS_OBJS_INUSE, 1);
-
-#ifdef CONFIG_ZPOOL
- if (zspage->under_reclaim) {
- /*
- * Reclaim needs the handles during writeback. It'll free
- * them along with the zspage when it's done with them.
- *
- * Record current deferred handle in the object's header.
- */
- obj_free(class->size, obj, &handle);
- spin_unlock(&pool->lock);
- return;
- }
-#endif
- obj_free(class->size, obj, NULL);
+ obj_free(class->size, obj);
fullness = fix_fullness_group(class, zspage);
if (fullness == ZS_INUSE_RATIO_0)
@@ -1640,8 +1490,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
obj_to_location(src, &s_page, &s_objidx);
obj_to_location(dst, &d_page, &d_objidx);
- s_off = (class->size * s_objidx) & ~PAGE_MASK;
- d_off = (class->size * d_objidx) & ~PAGE_MASK;
+ s_off = offset_in_page(class->size * s_objidx);
+ d_off = offset_in_page(class->size * d_objidx);
if (s_off + class->size > PAGE_SIZE)
s_size = PAGE_SIZE - s_off;
@@ -1735,18 +1585,6 @@ static unsigned long find_alloced_obj(struct size_class *class,
return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
}
-#ifdef CONFIG_ZPOOL
-/*
- * Find object storing a deferred handle in header in zspage from index object
- * and return handle.
- */
-static unsigned long find_deferred_handle_obj(struct size_class *class,
- struct page *page, int *obj_idx)
-{
- return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
-}
-#endif
-
struct zs_compact_control {
/* Source spage for migration which could be a subpage of zspage */
struct page *s_page;
@@ -1786,7 +1624,7 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class,
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
record_obj(handle, free_obj);
- obj_free(class->size, used_obj, NULL);
+ obj_free(class->size, used_obj);
}
/* Remember last position in this iteration */
@@ -1846,7 +1684,7 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage)
return fullness;
}
-#if defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION)
+#ifdef CONFIG_COMPACTION
/*
* To prevent zspage destroy during migration, zspage freeing should
* hold locks of all pages in the zspage.
@@ -1888,24 +1726,7 @@ static void lock_zspage(struct zspage *zspage)
}
migrate_read_unlock(zspage);
}
-#endif /* defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) */
-
-#ifdef CONFIG_ZPOOL
-/*
- * Unlocks all the pages of the zspage.
- *
- * pool->lock must be held before this function is called
- * to prevent the underlying pages from migrating.
- */
-static void unlock_zspage(struct zspage *zspage)
-{
- struct page *page = get_first_page(zspage);
-
- do {
- unlock_page(page);
- } while ((page = get_next_page(page)) != NULL);
-}
-#endif /* CONFIG_ZPOOL */
+#endif /* CONFIG_COMPACTION */
static void migrate_lock_init(struct zspage *zspage)
{
@@ -2126,9 +1947,6 @@ static void async_free_zspage(struct work_struct *work)
VM_BUG_ON(fullness != ZS_INUSE_RATIO_0);
class = pool->size_class[class_idx];
spin_lock(&pool->lock);
-#ifdef CONFIG_ZPOOL
- list_del(&zspage->lru);
-#endif
__free_zspage(pool, class, zspage);
spin_unlock(&pool->lock);
}
@@ -2474,10 +2292,6 @@ struct zs_pool *zs_create_pool(const char *name)
*/
zs_register_shrinker(pool);
-#ifdef CONFIG_ZPOOL
- INIT_LIST_HEAD(&pool->lru);
-#endif
-
return pool;
err:
@@ -2520,190 +2334,6 @@ void zs_destroy_pool(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
-#ifdef CONFIG_ZPOOL
-static void restore_freelist(struct zs_pool *pool, struct size_class *class,
- struct zspage *zspage)
-{
- unsigned int obj_idx = 0;
- unsigned long handle, off = 0; /* off is within-page offset */
- struct page *page = get_first_page(zspage);
- struct link_free *prev_free = NULL;
- void *prev_page_vaddr = NULL;
-
- /* in case no free object found */
- set_freeobj(zspage, (unsigned int)(-1UL));
-
- while (page) {
- void *vaddr = kmap_atomic(page);
- struct page *next_page;
-
- while (off < PAGE_SIZE) {
- void *obj_addr = vaddr + off;
-
- /* skip allocated object */
- if (obj_allocated(page, obj_addr, &handle)) {
- obj_idx++;
- off += class->size;
- continue;
- }
-
- /* free deferred handle from reclaim attempt */
- if (obj_stores_deferred_handle(page, obj_addr, &handle))
- cache_free_handle(pool, handle);
-
- if (prev_free)
- prev_free->next = obj_idx << OBJ_TAG_BITS;
- else /* first free object found */
- set_freeobj(zspage, obj_idx);
-
- prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
- /* if last free object in a previous page, need to unmap */
- if (prev_page_vaddr) {
- kunmap_atomic(prev_page_vaddr);
- prev_page_vaddr = NULL;
- }
-
- obj_idx++;
- off += class->size;
- }
-
- /*
- * Handle the last (full or partial) object on this page.
- */
- next_page = get_next_page(page);
- if (next_page) {
- if (!prev_free || prev_page_vaddr) {
- /*
- * There is no free object in this page, so we can safely
- * unmap it.
- */
- kunmap_atomic(vaddr);
- } else {
- /* update prev_page_vaddr since prev_free is on this page */
- prev_page_vaddr = vaddr;
- }
- } else { /* this is the last page */
- if (prev_free) {
- /*
- * Reset OBJ_TAG_BITS bit to last link to tell
- * whether it's allocated object or not.
- */
- prev_free->next = -1UL << OBJ_TAG_BITS;
- }
-
- /* unmap previous page (if not done yet) */
- if (prev_page_vaddr) {
- kunmap_atomic(prev_page_vaddr);
- prev_page_vaddr = NULL;
- }
-
- kunmap_atomic(vaddr);
- }
-
- page = next_page;
- off %= PAGE_SIZE;
- }
-}
-
-static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
-{
- int i, obj_idx, ret = 0;
- unsigned long handle;
- struct zspage *zspage;
- struct page *page;
- int fullness;
-
- /* Lock LRU and fullness list */
- spin_lock(&pool->lock);
- if (list_empty(&pool->lru)) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
-
- for (i = 0; i < retries; i++) {
- struct size_class *class;
-
- zspage = list_last_entry(&pool->lru, struct zspage, lru);
- list_del(&zspage->lru);
-
- /* zs_free may free objects, but not the zspage and handles */
- zspage->under_reclaim = true;
-
- class = zspage_class(pool, zspage);
- fullness = get_fullness_group(class, zspage);
-
- /* Lock out object allocations and object compaction */
- remove_zspage(class, zspage, fullness);
-
- spin_unlock(&pool->lock);
- cond_resched();
-
- /* Lock backing pages into place */
- lock_zspage(zspage);
-
- obj_idx = 0;
- page = get_first_page(zspage);
- while (1) {
- handle = find_alloced_obj(class, page, &obj_idx);
- if (!handle) {
- page = get_next_page(page);
- if (!page)
- break;
- obj_idx = 0;
- continue;
- }
-
- /*
- * This will write the object and call zs_free.
- *
- * zs_free will free the object, but the
- * under_reclaim flag prevents it from freeing
- * the zspage altogether. This is necessary so
- * that we can continue working with the
- * zspage potentially after the last object
- * has been freed.
- */
- ret = pool->zpool_ops->evict(pool->zpool, handle);
- if (ret)
- goto next;
-
- obj_idx++;
- }
-
-next:
- /* For freeing the zspage, or putting it back in the pool and LRU list. */
- spin_lock(&pool->lock);
- zspage->under_reclaim = false;
-
- if (!get_zspage_inuse(zspage)) {
- /*
- * Fullness went stale as zs_free() won't touch it
- * while the page is removed from the pool. Fix it
- * up for the check in __free_zspage().
- */
- zspage->fullness = ZS_INUSE_RATIO_0;
-
- __free_zspage(pool, class, zspage);
- spin_unlock(&pool->lock);
- return 0;
- }
-
- /*
- * Eviction fails on one of the handles, so we need to restore zspage.
- * We need to rebuild its freelist (and free stored deferred handles),
- * put it back to the correct size class, and add it to the LRU list.
- */
- restore_freelist(pool, class, zspage);
- putback_zspage(class, zspage);
- list_add(&zspage->lru, &pool->lru);
- unlock_zspage(zspage);
- }
-
- spin_unlock(&pool->lock);
- return -EAGAIN;
-}
-#endif /* CONFIG_ZPOOL */
-
static int __init zs_init(void)
{
int ret;
diff --git a/mm/zswap.c b/mm/zswap.c
index 30092d9a3b23..62195f72bf56 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -37,6 +37,7 @@
#include <linux/workqueue.h>
#include "swap.h"
+#include "internal.h"
/*********************************
* statistics
@@ -137,6 +138,10 @@ static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
bool, 0644);
+static bool zswap_exclusive_loads_enabled = IS_ENABLED(
+ CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
+module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
+
/*********************************
* data structures
**********************************/
@@ -149,6 +154,12 @@ struct crypto_acomp_ctx {
struct mutex *mutex;
};
+/*
+ * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
+ * The only case where lru_lock is not acquired while holding tree.lock is
+ * when a zswap_entry is taken off the lru for writeback, in that case it
+ * needs to be verified that it's still valid in the tree.
+ */
struct zswap_pool {
struct zpool *zpool;
struct crypto_acomp_ctx __percpu *acomp_ctx;
@@ -158,6 +169,8 @@ struct zswap_pool {
struct work_struct shrink_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
+ struct list_head lru;
+ spinlock_t lru_lock;
};
/*
@@ -175,14 +188,16 @@ struct zswap_pool {
* be held while changing the refcount. Since the lock must
* be held, there is no reason to also make refcount atomic.
* length - the length in bytes of the compressed page data. Needed during
- * decompression. For a same value filled page length is 0.
+ * decompression. For a same value filled page length is 0, and both
+ * pool and lru are invalid and must be ignored.
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
* value - value of the same-value filled pages which have same content
+ * lru - handle to the pool's lru used to evict pages.
*/
struct zswap_entry {
struct rb_node rbnode;
- pgoff_t offset;
+ swp_entry_t swpentry;
int refcount;
unsigned int length;
struct zswap_pool *pool;
@@ -191,10 +206,7 @@ struct zswap_entry {
unsigned long value;
};
struct obj_cgroup *objcg;
-};
-
-struct zswap_header {
- swp_entry_t swpentry;
+ struct list_head lru;
};
/*
@@ -238,14 +250,11 @@ static bool zswap_has_pool;
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpool))
-static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
+static int zswap_writeback_entry(struct zswap_entry *entry,
+ struct zswap_tree *tree);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);
-static const struct zpool_ops zswap_zpool_ops = {
- .evict = zswap_writeback_entry
-};
-
static bool zswap_is_full(void)
{
return totalram_pages() * zswap_max_pool_percent / 100 <
@@ -302,12 +311,14 @@ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
struct rb_node *node = root->rb_node;
struct zswap_entry *entry;
+ pgoff_t entry_offset;
while (node) {
entry = rb_entry(node, struct zswap_entry, rbnode);
- if (entry->offset > offset)
+ entry_offset = swp_offset(entry->swpentry);
+ if (entry_offset > offset)
node = node->rb_left;
- else if (entry->offset < offset)
+ else if (entry_offset < offset)
node = node->rb_right;
else
return entry;
@@ -324,13 +335,15 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
{
struct rb_node **link = &root->rb_node, *parent = NULL;
struct zswap_entry *myentry;
+ pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
while (*link) {
parent = *link;
myentry = rb_entry(parent, struct zswap_entry, rbnode);
- if (myentry->offset > entry->offset)
+ myentry_offset = swp_offset(myentry->swpentry);
+ if (myentry_offset > entry_offset)
link = &(*link)->rb_left;
- else if (myentry->offset < entry->offset)
+ else if (myentry_offset < entry_offset)
link = &(*link)->rb_right;
else {
*dupentry = myentry;
@@ -342,12 +355,14 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
return 0;
}
-static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
if (!RB_EMPTY_NODE(&entry->rbnode)) {
rb_erase(&entry->rbnode, root);
RB_CLEAR_NODE(&entry->rbnode);
+ return true;
}
+ return false;
}
/*
@@ -363,6 +378,9 @@ static void zswap_free_entry(struct zswap_entry *entry)
if (!entry->length)
atomic_dec(&zswap_same_filled_pages);
else {
+ spin_lock(&entry->pool->lru_lock);
+ list_del(&entry->lru);
+ spin_unlock(&entry->pool->lru_lock);
zpool_free(entry->pool->zpool, entry->handle);
zswap_pool_put(entry->pool);
}
@@ -583,13 +601,95 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}
+/*
+ * If the entry is still valid in the tree, drop the initial ref and remove it
+ * from the tree. This function must be called with an additional ref held,
+ * otherwise it may race with another invalidation freeing the entry.
+ */
+static void zswap_invalidate_entry(struct zswap_tree *tree,
+ struct zswap_entry *entry)
+{
+ if (zswap_rb_erase(&tree->rbroot, entry))
+ zswap_entry_put(tree, entry);
+}
+
+static int zswap_reclaim_entry(struct zswap_pool *pool)
+{
+ struct zswap_entry *entry;
+ struct zswap_tree *tree;
+ pgoff_t swpoffset;
+ int ret;
+
+ /* Get an entry off the LRU */
+ spin_lock(&pool->lru_lock);
+ if (list_empty(&pool->lru)) {
+ spin_unlock(&pool->lru_lock);
+ return -EINVAL;
+ }
+ entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
+ list_del_init(&entry->lru);
+ /*
+ * Once the lru lock is dropped, the entry might get freed. The
+ * swpoffset is copied to the stack, and entry isn't deref'd again
+ * until the entry is verified to still be alive in the tree.
+ */
+ swpoffset = swp_offset(entry->swpentry);
+ tree = zswap_trees[swp_type(entry->swpentry)];
+ spin_unlock(&pool->lru_lock);
+
+ /* Check for invalidate() race */
+ spin_lock(&tree->lock);
+ if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+ /* Hold a reference to prevent a free during writeback */
+ zswap_entry_get(entry);
+ spin_unlock(&tree->lock);
+
+ ret = zswap_writeback_entry(entry, tree);
+
+ spin_lock(&tree->lock);
+ if (ret) {
+ /* Writeback failed, put entry back on LRU */
+ spin_lock(&pool->lru_lock);
+ list_move(&entry->lru, &pool->lru);
+ spin_unlock(&pool->lru_lock);
+ goto put_unlock;
+ }
+
+ /*
+ * Writeback started successfully, the page now belongs to the
+ * swapcache. Drop the entry from zswap - unless invalidate already
+ * took it out while we had the tree->lock released for IO.
+ */
+ zswap_invalidate_entry(tree, entry);
+
+put_unlock:
+ /* Drop local reference */
+ zswap_entry_put(tree, entry);
+unlock:
+ spin_unlock(&tree->lock);
+ return ret ? -EAGAIN : 0;
+}
+
static void shrink_worker(struct work_struct *w)
{
struct zswap_pool *pool = container_of(w, typeof(*pool),
shrink_work);
+ int ret, failures = 0;
- if (zpool_shrink(pool->zpool, 1, NULL))
- zswap_reject_reclaim_fail++;
+ do {
+ ret = zswap_reclaim_entry(pool);
+ if (ret) {
+ zswap_reject_reclaim_fail++;
+ if (ret != -EAGAIN)
+ break;
+ if (++failures == MAX_RECLAIM_RETRIES)
+ break;
+ }
+ cond_resched();
+ } while (!zswap_can_accept());
zswap_pool_put(pool);
}
@@ -618,7 +718,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
/* unique name for each pool specifically required by zsmalloc */
snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
- pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
+ pool->zpool = zpool_create_pool(type, name, gfp);
if (!pool->zpool) {
pr_err("%s zpool not available\n", type);
goto error;
@@ -644,6 +744,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
*/
kref_init(&pool->kref);
INIT_LIST_HEAD(&pool->list);
+ INIT_LIST_HEAD(&pool->lru);
+ spin_lock_init(&pool->lru_lock);
INIT_WORK(&pool->shrink_work, shrink_worker);
zswap_pool_debug("created", pool);
@@ -964,16 +1066,14 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
* the swap cache, the compressed version stored by zswap can be
* freed.
*/
-static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
+static int zswap_writeback_entry(struct zswap_entry *entry,
+ struct zswap_tree *tree)
{
- struct zswap_header *zhdr;
- swp_entry_t swpentry;
- struct zswap_tree *tree;
- pgoff_t offset;
- struct zswap_entry *entry;
+ swp_entry_t swpentry = entry->swpentry;
struct page *page;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
+ struct zpool *pool = entry->pool->zpool;
u8 *src, *tmp = NULL;
unsigned int dlen;
@@ -988,25 +1088,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
return -ENOMEM;
}
- /* extract swpentry from data */
- zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
- swpentry = zhdr->swpentry; /* here */
- tree = zswap_trees[swp_type(swpentry)];
- offset = swp_offset(swpentry);
- zpool_unmap_handle(pool, handle);
-
- /* find and ref zswap entry */
- spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
- if (!entry) {
- /* entry was invalidated */
- spin_unlock(&tree->lock);
- kfree(tmp);
- return 0;
- }
- spin_unlock(&tree->lock);
- BUG_ON(offset != entry->offset);
-
/* try to allocate swap cache page */
switch (zswap_get_swap_cache_page(swpentry, &page)) {
case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
@@ -1028,7 +1109,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
* writing.
*/
spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, entry->offset) != entry) {
+ if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(page_folio(page));
ret = -ENOMEM;
@@ -1040,12 +1121,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
dlen = PAGE_SIZE;
- zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
- src = (u8 *)zhdr + sizeof(struct zswap_header);
+ src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
if (!zpool_can_sleep_mapped(pool)) {
memcpy(tmp, src, entry->length);
src = tmp;
- zpool_unmap_handle(pool, handle);
+ zpool_unmap_handle(pool, entry->handle);
}
mutex_lock(acomp_ctx->mutex);
@@ -1060,7 +1140,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
if (!zpool_can_sleep_mapped(pool))
kfree(tmp);
else
- zpool_unmap_handle(pool, handle);
+ zpool_unmap_handle(pool, entry->handle);
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -1077,23 +1157,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
put_page(page);
zswap_written_back_pages++;
- spin_lock(&tree->lock);
- /* drop local reference */
- zswap_entry_put(tree, entry);
-
- /*
- * There are two possible situations for entry here:
- * (1) refcount is 1(normal case), entry is valid and on the tree
- * (2) refcount is 0, entry is freed and not on the tree
- * because invalidate happened during writeback
- * search the tree and free the entry if find entry
- */
- if (entry == zswap_rb_search(&tree->rbroot, offset))
- zswap_entry_put(tree, entry);
- spin_unlock(&tree->lock);
-
return ret;
-
fail:
if (!zpool_can_sleep_mapped(pool))
kfree(tmp);
@@ -1102,13 +1166,8 @@ fail:
* if we get here due to ZSWAP_SWAPCACHE_EXIST
* a load may be happening concurrently.
* it is safe and okay to not free the entry.
- * if we free the entry in the following put
* it is also okay to return !0
*/
- spin_lock(&tree->lock);
- zswap_entry_put(tree, entry);
- spin_unlock(&tree->lock);
-
return ret;
}
@@ -1156,11 +1215,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
struct obj_cgroup *objcg = NULL;
struct zswap_pool *pool;
int ret;
- unsigned int hlen, dlen = PAGE_SIZE;
+ unsigned int dlen = PAGE_SIZE;
unsigned long handle, value;
char *buf;
u8 *src, *dst;
- struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
gfp_t gfp;
/* THP isn't supported */
@@ -1195,7 +1253,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
if (zswap_pool_reached_full) {
if (!zswap_can_accept()) {
ret = -ENOMEM;
- goto reject;
+ goto shrink;
} else
zswap_pool_reached_full = false;
}
@@ -1212,7 +1270,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
src = kmap_atomic(page);
if (zswap_is_page_same_filled(src, &value)) {
kunmap_atomic(src);
- entry->offset = offset;
+ entry->swpentry = swp_entry(type, offset);
entry->length = 0;
entry->value = value;
atomic_inc(&zswap_same_filled_pages);
@@ -1266,11 +1324,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}
/* store */
- hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
if (zpool_malloc_support_movable(entry->pool->zpool))
gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
- ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
+ ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
@@ -1280,13 +1337,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto put_dstmem;
}
buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
- memcpy(buf, &zhdr, hlen);
- memcpy(buf + hlen, dst, dlen);
+ memcpy(buf, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
mutex_unlock(acomp_ctx->mutex);
/* populate entry */
- entry->offset = offset;
+ entry->swpentry = swp_entry(type, offset);
entry->handle = handle;
entry->length = dlen;
@@ -1309,6 +1365,11 @@ insert_entry:
zswap_entry_put(tree, dupentry);
}
} while (ret == -EEXIST);
+ if (entry->length) {
+ spin_lock(&entry->pool->lru_lock);
+ list_add(&entry->lru, &entry->pool->lru);
+ spin_unlock(&entry->pool->lru_lock);
+ }
spin_unlock(&tree->lock);
/* update stats */
@@ -1341,7 +1402,7 @@ shrink:
* return -1 on entry not found or error
*/
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
- struct page *page)
+ struct page *page, bool *exclusive)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
@@ -1380,8 +1441,6 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
/* decompress */
dlen = PAGE_SIZE;
src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
- if (zpool_evictable(entry->pool->zpool))
- src += sizeof(struct zswap_header);
if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
memcpy(tmp, src, entry->length);
@@ -1410,6 +1469,14 @@ stats:
count_objcg_event(entry->objcg, ZSWPIN);
freeentry:
spin_lock(&tree->lock);
+ if (!ret && zswap_exclusive_loads_enabled) {
+ zswap_invalidate_entry(tree, entry);
+ *exclusive = true;
+ } else if (entry->length) {
+ spin_lock(&entry->pool->lru_lock);
+ list_move(&entry->lru, &entry->pool->lru);
+ spin_unlock(&entry->pool->lru_lock);
+ }
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
@@ -1430,13 +1497,7 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
spin_unlock(&tree->lock);
return;
}
-
- /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, entry);
-
- /* drop the initial reference from entry creation */
- zswap_entry_put(tree, entry);
-
+ zswap_invalidate_entry(tree, entry);
spin_unlock(&tree->lock);
}