Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 38
-rw-r--r--  mm/Kconfig.debug | 1
-rw-r--r--  mm/Makefile | 4
-rw-r--r--  mm/backing-dev.c | 17
-rw-r--r--  mm/cma.c | 4
-rw-r--r--  mm/compaction.c | 334
-rw-r--r--  mm/damon/core-test.h | 24
-rw-r--r--  mm/damon/core.c | 2
-rw-r--r--  mm/damon/ops-common.c | 32
-rw-r--r--  mm/damon/ops-common.h | 4
-rw-r--r--  mm/damon/paddr.c | 6
-rw-r--r--  mm/damon/vaddr.c | 26
-rw-r--r--  mm/debug.c | 9
-rw-r--r--  mm/debug_page_alloc.c | 59
-rw-r--r--  mm/debug_vm_pgtable.c | 9
-rw-r--r--  mm/dmapool.c | 10
-rw-r--r--  mm/early_ioremap.c | 8
-rw-r--r--  mm/fadvise.c | 17
-rw-r--r--  mm/fail_page_alloc.c | 66
-rw-r--r--  mm/filemap.c | 455
-rw-r--r--  mm/frontswap.c | 10
-rw-r--r--  mm/gup.c | 466
-rw-r--r--  mm/gup_test.c | 28
-rw-r--r--  mm/highmem.c | 12
-rw-r--r--  mm/hmm.c | 6
-rw-r--r--  mm/huge_memory.c | 56
-rw-r--r--  mm/hugetlb.c | 126
-rw-r--r--  mm/hugetlb_vmemmap.c | 17
-rw-r--r--  mm/internal.h | 93
-rw-r--r--  mm/kasan/common.c | 2
-rw-r--r--  mm/kasan/generic.c | 76
-rw-r--r--  mm/kasan/init.c | 9
-rw-r--r--  mm/kasan/kasan.h | 159
-rw-r--r--  mm/kasan/report.c | 44
-rw-r--r--  mm/kasan/report_generic.c | 12
-rw-r--r--  mm/kasan/report_hw_tags.c | 2
-rw-r--r--  mm/kasan/report_sw_tags.c | 2
-rw-r--r--  mm/kasan/shadow.c | 46
-rw-r--r--  mm/kasan/sw_tags.c | 20
-rw-r--r--  mm/kasan/tags.c | 2
-rw-r--r--  mm/kfence/kfence_test.c | 7
-rw-r--r--  mm/khugepaged.c | 133
-rw-r--r--  mm/kmsan/core.c | 6
-rw-r--r--  mm/kmsan/instrumentation.c | 2
-rw-r--r--  mm/ksm.c | 38
-rw-r--r--  mm/madvise.c | 150
-rw-r--r--  mm/mapping_dirty_helpers.c | 38
-rw-r--r--  mm/memblock.c | 42
-rw-r--r--  mm/memcontrol.c | 253
-rw-r--r--  mm/memfd.c | 9
-rw-r--r--  mm/memory-failure.c | 45
-rw-r--r--  mm/memory-tiers.c | 3
-rw-r--r--  mm/memory.c | 493
-rw-r--r--  mm/memory_hotplug.c | 42
-rw-r--r--  mm/mempolicy.c | 28
-rw-r--r--  mm/migrate.c | 382
-rw-r--r--  mm/migrate_device.c | 46
-rw-r--r--  mm/mincore.c | 11
-rw-r--r--  mm/mlock.c | 10
-rw-r--r--  mm/mm_init.c | 161
-rw-r--r--  mm/mmap.c | 383
-rw-r--r--  mm/mprotect.c | 89
-rw-r--r--  mm/mremap.c | 35
-rw-r--r--  mm/nommu.c | 17
-rw-r--r--  mm/oom_kill.c | 8
-rw-r--r--  mm/page-writeback.c | 6
-rw-r--r--  mm/page_alloc.c | 1046
-rw-r--r--  mm/page_io.c | 8
-rw-r--r--  mm/page_isolation.c | 33
-rw-r--r--  mm/page_owner.c | 2
-rw-r--r--  mm/page_table_check.c | 12
-rw-r--r--  mm/page_vma_mapped.c | 114
-rw-r--r--  mm/pagewalk.c | 33
-rw-r--r--  mm/percpu-internal.h | 11
-rw-r--r--  mm/pgtable-generic.c | 58
-rw-r--r--  mm/process_vm_access.c | 2
-rw-r--r--  mm/ptdump.c | 2
-rw-r--r--  mm/readahead.c | 1
-rw-r--r--  mm/rmap.c | 36
-rw-r--r--  mm/secretmem.c | 4
-rw-r--r--  mm/shmem.c | 141
-rw-r--r--  mm/show_mem.c | 429
-rw-r--r--  mm/shrinker_debug.c | 39
-rw-r--r--  mm/slab.c | 43
-rw-r--r--  mm/slab.h | 71
-rw-r--r--  mm/slab_common.c | 64
-rw-r--r--  mm/slub.c | 186
-rw-r--r--  mm/sparse-vmemmap.c | 8
-rw-r--r--  mm/sparse.c | 12
-rw-r--r--  mm/swap.c | 20
-rw-r--r--  mm/swap_state.c | 87
-rw-r--r--  mm/swapfile.c | 115
-rw-r--r--  mm/truncate.c | 27
-rw-r--r--  mm/userfaultfd.c | 12
-rw-r--r--  mm/vmalloc.c | 147
-rw-r--r--  mm/vmscan.c | 322
-rw-r--r--  mm/vmstat.c | 18
-rw-r--r--  mm/workingset.c | 158
-rw-r--r--  mm/z3fold.c | 249
-rw-r--r--  mm/zbud.c | 167
-rw-r--r--  mm/zpool.c | 48
-rw-r--r--  mm/zsmalloc.c | 408
-rw-r--r--  mm/zswap.c | 250
103 files changed, 4773 insertions, 4360 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 7672a22647b4..09130434e30d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -46,6 +46,22 @@ config ZSWAP_DEFAULT_ON
The selection made here can be overridden by using the kernel
command line 'zswap.enabled=' option.
+config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON
+ bool "Invalidate zswap entries when pages are loaded"
+ depends on ZSWAP
+ help
+ If selected, exclusive loads for zswap will be enabled at boot,
+ otherwise it will be disabled.
+
+ If exclusive loads are enabled, when a page is loaded from zswap,
+ the zswap entry is invalidated at once, as opposed to leaving it
+ in zswap until the swap entry is freed.
+
+ This avoids having two copies of the same page in memory
+ (compressed and uncompressed) after faulting in a page from zswap.
+ The cost is that if the page was never dirtied and needs to be
+ swapped out again, it will be re-compressed.
+
choice
prompt "Default compressor"
depends on ZSWAP
@@ -218,18 +234,23 @@ choice
help
This option allows to select a slab allocator.
-config SLAB
- bool "SLAB"
+config SLAB_DEPRECATED
+ bool "SLAB (DEPRECATED)"
depends on !PREEMPT_RT
- select HAVE_HARDENED_USERCOPY_ALLOCATOR
help
+ Deprecated and scheduled for removal in a few cycles. Replaced by
+ SLUB.
+
+ If you cannot migrate to SLUB, please contact linux-mm@kvack.org
+ and the people listed in the SLAB ALLOCATOR section of MAINTAINERS
+ file, explaining why.
+
The regular slab allocator that is established and known to work
well in all environments. It organizes cache hot objects in
per cpu and per node queues.
config SLUB
bool "SLUB (Unqueued Allocator)"
- select HAVE_HARDENED_USERCOPY_ALLOCATOR
help
SLUB is a slab allocator that minimizes cache line usage
instead of managing queues of cached objects (SLAB approach).
@@ -240,6 +261,11 @@ config SLUB
endchoice
+config SLAB
+ bool
+ default y
+ depends on SLAB_DEPRECATED
+
config SLUB_TINY
bool "Configure SLUB for minimal memory footprint"
depends on SLUB && EXPERT
@@ -1206,6 +1232,10 @@ config PER_VMA_LOCK
This feature allows locking each virtual memory area separately when
handling page faults instead of taking mmap_lock.
+config LOCK_MM_AND_FIND_VMA
+ bool
+ depends on !STACK_GROWSUP
+
source "mm/damon/Kconfig"
endmenu
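
The ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON help text above describes the policy only in prose; the toy model below just restates that trade-off in code. All names here (toy_entry, toy_zswap_load, exclusive_loads) are illustrative and are not zswap internals.

/* Toy restatement of the exclusive-loads trade-off described above. */
struct toy_entry {
	bool has_compressed_copy;
};

static void toy_zswap_load(struct toy_entry *entry, bool exclusive_loads)
{
	/* ...decompress the entry into the faulting page here... */
	if (exclusive_loads) {
		/* Invalidate at once: no duplicate compressed copy is kept. */
		entry->has_compressed_copy = false;
	}
	/*
	 * Otherwise the compressed copy stays until the swap entry is freed:
	 * a page that is never dirtied can be dropped again later without
	 * re-compression, at the cost of holding both copies in the meantime.
	 */
}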
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index a925415b4d10..018a5bd2f576 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -98,6 +98,7 @@ config PAGE_OWNER
config PAGE_TABLE_CHECK
bool "Check for invalid mappings in user page tables"
depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK
+ depends on EXCLUSIVE_SYSTEM_RAM
select PAGE_EXTENSION
help
Check that anonymous page is not being mapped twice with read write
diff --git a/mm/Makefile b/mm/Makefile
index e29afc890cde..678530a07326 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,7 +51,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o \
+ compaction.o show_mem.o\
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
@@ -89,6 +89,7 @@ obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_KFENCE) += kfence/
obj-$(CONFIG_KMSAN) += kmsan/
obj-$(CONFIG_FAILSLAB) += failslab.o
+obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
@@ -123,6 +124,7 @@ obj-$(CONFIG_SECRETMEM) += secretmem.o
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
+obj-$(CONFIG_DEBUG_PAGEALLOC) += debug_page_alloc.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_DAMON) += damon/
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7da9727fcdf3..3ffc3cfa7a14 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -20,7 +20,6 @@
struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
-static struct class *bdi_class;
static const char *bdi_unknown_name = "(unknown)";
/*
@@ -345,13 +344,19 @@ static struct attribute *bdi_dev_attrs[] = {
};
ATTRIBUTE_GROUPS(bdi_dev);
+static const struct class bdi_class = {
+ .name = "bdi",
+ .dev_groups = bdi_dev_groups,
+};
+
static __init int bdi_class_init(void)
{
- bdi_class = class_create("bdi");
- if (IS_ERR(bdi_class))
- return PTR_ERR(bdi_class);
+ int ret;
+
+ ret = class_register(&bdi_class);
+ if (ret)
+ return ret;
- bdi_class->dev_groups = bdi_dev_groups;
bdi_debug_init();
return 0;
@@ -1001,7 +1006,7 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
return 0;
vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
- dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
+ dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
if (IS_ERR(dev))
return PTR_ERR(dev);
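
The backing-dev.c hunks above replace a dynamically allocated bdi class (class_create() plus a post-hoc dev_groups assignment) with a statically defined const struct class registered via class_register(). A minimal sketch of the same driver-core pattern, with an assumed "example" subsystem name and attribute list:

/* Minimal sketch of the pattern used above; the "example" names are assumed. */
static struct attribute *example_dev_attrs[] = {
	NULL,	/* device attributes would be listed here */
};
ATTRIBUTE_GROUPS(example_dev);

static const struct class example_class = {
	.name		= "example",
	.dev_groups	= example_dev_groups,
};

static int __init example_class_init(void)
{
	return class_register(&example_class);
}

/* Devices are then created against the static class object, as in the hunk above: */
/* dev = device_create(&example_class, NULL, MKDEV(0, 0), drvdata, "name"); */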
diff --git a/mm/cma.c b/mm/cma.c
index 6268d6620254..a4cfe995e11e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -483,8 +483,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
if (ret != -EBUSY)
break;
- pr_debug("%s(): memory range at %p is busy, retrying\n",
- __func__, pfn_to_page(pfn));
+ pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n",
+ __func__, pfn, pfn_to_page(pfn));
trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn),
count, align);
diff --git a/mm/compaction.c b/mm/compaction.c
index c8bcdea15f5f..dbc9f86b1934 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -229,6 +229,33 @@ static void reset_cached_positions(struct zone *zone)
pageblock_start_pfn(zone_end_pfn(zone) - 1);
}
+#ifdef CONFIG_SPARSEMEM
+/*
+ * If the PFN falls into an offline section, return the start PFN of the
+ * next online section. If the PFN falls into an online section or if
+ * there is no next online section, return 0.
+ */
+static unsigned long skip_offline_sections(unsigned long start_pfn)
+{
+ unsigned long start_nr = pfn_to_section_nr(start_pfn);
+
+ if (online_section_nr(start_nr))
+ return 0;
+
+ while (++start_nr <= __highest_present_section_nr) {
+ if (online_section_nr(start_nr))
+ return section_nr_to_pfn(start_nr);
+ }
+
+ return 0;
+}
+#else
+static unsigned long skip_offline_sections(unsigned long start_pfn)
+{
+ return 0;
+}
+#endif
+
/*
* Compound pages of >= pageblock_order should consistently be skipped until
* released. It is always pointless to compact pages of such order (if they are
@@ -392,18 +419,14 @@ void reset_isolation_suitable(pg_data_t *pgdat)
* Sets the pageblock skip bit if it was clear. Note that this is a hint as
* locks are not required for read/writers. Returns true if it was already set.
*/
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
- unsigned long pfn)
+static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
bool skip;
- /* Do no update if skip hint is being ignored */
+ /* Do not update if skip hint is being ignored */
if (cc->ignore_skip_hint)
return false;
- if (!pageblock_aligned(pfn))
- return false;
-
skip = get_pageblock_skip(page);
if (!skip && !cc->no_set_skip_hint)
set_pageblock_skip(page);
@@ -440,9 +463,6 @@ static void update_pageblock_skip(struct compact_control *cc,
if (cc->no_set_skip_hint)
return;
- if (!page)
- return;
-
set_pageblock_skip(page);
/* Update where async and sync compaction should restart */
@@ -470,8 +490,7 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
{
}
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
- unsigned long pfn)
+static bool test_and_set_skip(struct compact_control *cc, struct page *page)
{
return false;
}
@@ -745,8 +764,9 @@ isolate_freepages_range(struct compact_control *cc,
}
/* Similar to reclaim, but different enough that they don't share logic */
-static bool too_many_isolated(pg_data_t *pgdat)
+static bool too_many_isolated(struct compact_control *cc)
{
+ pg_data_t *pgdat = cc->zone->zone_pgdat;
bool too_many;
unsigned long active, inactive, isolated;
@@ -758,6 +778,17 @@ static bool too_many_isolated(pg_data_t *pgdat)
isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
node_page_state(pgdat, NR_ISOLATED_ANON);
+ /*
+ * Allow GFP_NOFS to isolate past the limit set for regular
+ * compaction runs. This prevents an ABBA deadlock when other
+ * compactors have already isolated to the limit, but are
+ * blocked on filesystem locks held by the GFP_NOFS thread.
+ */
+ if (cc->gfp_mask & __GFP_FS) {
+ inactive >>= 3;
+ active >>= 3;
+ }
+
too_many = isolated > (inactive + active) / 2;
if (!too_many)
wake_throttle_isolated(pgdat);
@@ -791,6 +822,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
struct lruvec *lruvec;
unsigned long flags = 0;
struct lruvec *locked = NULL;
+ struct folio *folio = NULL;
struct page *page = NULL, *valid_page = NULL;
struct address_space *mapping;
unsigned long start_pfn = low_pfn;
@@ -806,7 +838,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
- while (unlikely(too_many_isolated(pgdat))) {
+ while (unlikely(too_many_isolated(cc))) {
/* stop isolation if there are still pages not migrated */
if (cc->nr_migratepages)
return -EAGAIN;
@@ -887,7 +919,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!valid_page && pageblock_aligned(low_pfn)) {
if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
- page = NULL;
+ folio = NULL;
goto isolate_abort;
}
valid_page = page;
@@ -919,7 +951,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* Hugepage was successfully isolated and placed
* on the cc->migratepages list.
*/
- low_pfn += compound_nr(page) - 1;
+ folio = page_folio(page);
+ low_pfn += folio_nr_pages(folio) - 1;
goto isolate_success_no_list;
}
@@ -987,8 +1020,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
locked = NULL;
}
- if (isolate_movable_page(page, mode))
+ if (isolate_movable_page(page, mode)) {
+ folio = page_folio(page);
goto isolate_success;
+ }
}
goto isolate_fail;
@@ -999,7 +1034,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
- if (unlikely(!get_page_unless_zero(page)))
+ folio = folio_get_nontail_page(page);
+ if (unlikely(!folio))
goto isolate_fail;
/*
@@ -1007,8 +1043,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
- mapping = page_mapping(page);
- if (!mapping && (page_count(page) - 1) > total_mapcount(page))
+ mapping = folio_mapping(folio);
+ if (!mapping && (folio_ref_count(folio) - 1) > folio_mapcount(folio))
goto isolate_fail_put;
/*
@@ -1019,11 +1055,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
goto isolate_fail_put;
/* Only take pages on LRU: a check now makes later tests safe */
- if (!PageLRU(page))
+ if (!folio_test_lru(folio))
goto isolate_fail_put;
/* Compaction might skip unevictable pages but CMA takes them */
- if (!(mode & ISOLATE_UNEVICTABLE) && PageUnevictable(page))
+ if (!(mode & ISOLATE_UNEVICTABLE) && folio_test_unevictable(folio))
goto isolate_fail_put;
/*
@@ -1032,10 +1068,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* it will be able to migrate without blocking - clean pages
* for the most part. PageWriteback would require blocking.
*/
- if ((mode & ISOLATE_ASYNC_MIGRATE) && PageWriteback(page))
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio))
goto isolate_fail_put;
- if ((mode & ISOLATE_ASYNC_MIGRATE) && PageDirty(page)) {
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_dirty(folio)) {
bool migrate_dirty;
/*
@@ -1047,22 +1083,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* the page lock until after the page is removed
* from the page cache.
*/
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto isolate_fail_put;
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
migrate_dirty = !mapping ||
mapping->a_ops->migrate_folio;
- unlock_page(page);
+ folio_unlock(folio);
if (!migrate_dirty)
goto isolate_fail_put;
}
- /* Try isolate the page */
- if (!TestClearPageLRU(page))
+ /* Try isolate the folio */
+ if (!folio_test_clear_lru(folio))
goto isolate_fail_put;
- lruvec = folio_lruvec(page_folio(page));
+ lruvec = folio_lruvec(folio);
/* If we already hold the lock, we can skip some rechecking */
if (lruvec != locked) {
@@ -1072,44 +1108,49 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
locked = lruvec;
- lruvec_memcg_debug(lruvec, page_folio(page));
+ lruvec_memcg_debug(lruvec, folio);
- /* Try get exclusive access under lock */
- if (!skip_updated) {
+ /*
+ * Try get exclusive access under lock. If marked for
+ * skip, the scan is aborted unless the current context
+ * is a rescan to reach the end of the pageblock.
+ */
+ if (!skip_updated && valid_page) {
skip_updated = true;
- if (test_and_set_skip(cc, page, low_pfn))
+ if (test_and_set_skip(cc, valid_page) &&
+ !cc->finish_pageblock) {
goto isolate_abort;
+ }
}
/*
- * Page become compound since the non-locked check,
- * and it's on LRU. It can only be a THP so the order
- * is safe to read and it's 0 for tail pages.
+ * folio become large since the non-locked check,
+ * and it's on LRU.
*/
- if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
- low_pfn += compound_nr(page) - 1;
- nr_scanned += compound_nr(page) - 1;
- SetPageLRU(page);
+ if (unlikely(folio_test_large(folio) && !cc->alloc_contig)) {
+ low_pfn += folio_nr_pages(folio) - 1;
+ nr_scanned += folio_nr_pages(folio) - 1;
+ folio_set_lru(folio);
goto isolate_fail_put;
}
}
- /* The whole page is taken off the LRU; skip the tail pages. */
- if (PageCompound(page))
- low_pfn += compound_nr(page) - 1;
+ /* The folio is taken off the LRU */
+ if (folio_test_large(folio))
+ low_pfn += folio_nr_pages(folio) - 1;
/* Successfully isolated */
- del_page_from_lru_list(page, lruvec);
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_is_file_lru(page),
- thp_nr_pages(page));
+ lruvec_del_folio(lruvec, folio);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
isolate_success:
- list_add(&page->lru, &cc->migratepages);
+ list_add(&folio->lru, &cc->migratepages);
isolate_success_no_list:
- cc->nr_migratepages += compound_nr(page);
- nr_isolated += compound_nr(page);
- nr_scanned += compound_nr(page) - 1;
+ cc->nr_migratepages += folio_nr_pages(folio);
+ nr_isolated += folio_nr_pages(folio);
+ nr_scanned += folio_nr_pages(folio) - 1;
/*
* Avoid isolating too much unless this block is being
@@ -1131,7 +1172,7 @@ isolate_fail_put:
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
}
- put_page(page);
+ folio_put(folio);
isolate_fail:
if (!skip_on_failure && ret != -ENOMEM)
@@ -1172,14 +1213,14 @@ isolate_fail:
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
- page = NULL;
+ folio = NULL;
isolate_abort:
if (locked)
unlock_page_lruvec_irqrestore(locked, flags);
- if (page) {
- SetPageLRU(page);
- put_page(page);
+ if (folio) {
+ folio_set_lru(folio);
+ folio_put(folio);
}
/*
@@ -1191,7 +1232,7 @@ isolate_abort:
* rescanned twice in a row.
*/
if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) {
- if (valid_page && !skip_updated)
+ if (!cc->no_set_skip_hint && valid_page && !skip_updated)
set_pageblock_skip(valid_page);
update_cached_migrate(cc, low_pfn);
}
@@ -1379,7 +1420,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn)
isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
- if (cc->nr_freepages < cc->nr_migratepages)
+ if (start_pfn == end_pfn)
set_pageblock_skip(page);
return;
@@ -1403,11 +1444,10 @@ static int next_search_order(struct compact_control *cc, int order)
return order;
}
-static unsigned long
-fast_isolate_freepages(struct compact_control *cc)
+static void fast_isolate_freepages(struct compact_control *cc)
{
unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
- unsigned int nr_scanned = 0;
+ unsigned int nr_scanned = 0, total_isolated = 0;
unsigned long low_pfn, min_pfn, highest = 0;
unsigned long nr_isolated = 0;
unsigned long distance;
@@ -1417,7 +1457,7 @@ fast_isolate_freepages(struct compact_control *cc)
/* Full compaction passes in a negative order */
if (cc->order <= 0)
- return cc->free_pfn;
+ return;
/*
* If starting the scan, use a deeper search and use the highest
@@ -1506,6 +1546,7 @@ fast_isolate_freepages(struct compact_control *cc)
set_page_private(page, order);
nr_isolated = 1 << order;
nr_scanned += nr_isolated - 1;
+ total_isolated += nr_isolated;
cc->nr_freepages += nr_isolated;
list_add_tail(&page->lru, &cc->freepages);
count_compact_events(COMPACTISOLATED, nr_isolated);
@@ -1518,6 +1559,10 @@ fast_isolate_freepages(struct compact_control *cc)
spin_unlock_irqrestore(&cc->zone->lock, flags);
+ /* Skip fast search if enough freepages isolated */
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ break;
+
/*
* Smaller scan on next order so the total scan is related
* to freelist_scan_limit.
@@ -1526,6 +1571,9 @@ fast_isolate_freepages(struct compact_control *cc)
limit = max(1U, limit >> 1);
}
+ trace_mm_compaction_fast_isolate_freepages(min_pfn, cc->free_pfn,
+ nr_scanned, total_isolated);
+
if (!page) {
cc->fast_search_fail++;
if (scan_start) {
@@ -1556,11 +1604,10 @@ fast_isolate_freepages(struct compact_control *cc)
cc->total_free_scanned += nr_scanned;
if (!page)
- return cc->free_pfn;
+ return;
low_pfn = page_to_pfn(page);
fast_isolate_around(cc, low_pfn);
- return low_pfn;
}
/*
@@ -1684,11 +1731,10 @@ splitmap:
* This is a migrate-callback that "allocates" freepages by taking pages
* from the isolated freelists in the block we are migrating to.
*/
-static struct page *compaction_alloc(struct page *migratepage,
- unsigned long data)
+static struct folio *compaction_alloc(struct folio *src, unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
- struct page *freepage;
+ struct folio *dst;
if (list_empty(&cc->freepages)) {
isolate_freepages(cc);
@@ -1697,11 +1743,11 @@ static struct page *compaction_alloc(struct page *migratepage,
return NULL;
}
- freepage = list_entry(cc->freepages.next, struct page, lru);
- list_del(&freepage->lru);
+ dst = list_entry(cc->freepages.next, struct folio, lru);
+ list_del(&dst->lru);
cc->nr_freepages--;
- return freepage;
+ return dst;
}
/*
@@ -1709,11 +1755,11 @@ static struct page *compaction_alloc(struct page *migratepage,
* freelist. All pages on the freelist are from the same zone, so there is no
* special handling needed for NUMA.
*/
-static void compaction_free(struct page *page, unsigned long data)
+static void compaction_free(struct folio *dst, unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
- list_add(&page->lru, &cc->freepages);
+ list_add(&dst->lru, &cc->freepages);
cc->nr_freepages++;
}
@@ -1736,6 +1782,7 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE
*/
static unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
static int sysctl_extfrag_threshold = 500;
+static int __read_mostly sysctl_compact_memory;
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -1864,7 +1911,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
- set_pageblock_skip(freepage);
break;
}
}
@@ -1940,8 +1986,14 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
page = pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, cc->zone);
- if (!page)
+ if (!page) {
+ unsigned long next_pfn;
+
+ next_pfn = skip_offline_sections(block_start_pfn);
+ if (next_pfn)
+ block_end_pfn = min(next_pfn, cc->free_pfn);
continue;
+ }
/*
* If isolation recently failed, do not retry. Only check the
@@ -2193,25 +2245,11 @@ static enum compact_result compact_finished(struct compact_control *cc)
return ret;
}
-static enum compact_result __compaction_suitable(struct zone *zone, int order,
- unsigned int alloc_flags,
- int highest_zoneidx,
- unsigned long wmark_target)
+static bool __compaction_suitable(struct zone *zone, int order,
+ int highest_zoneidx,
+ unsigned long wmark_target)
{
unsigned long watermark;
-
- if (is_via_compact_memory(order))
- return COMPACT_CONTINUE;
-
- watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
- /*
- * If watermarks for high-order allocation are already met, there
- * should be no need for compaction at all.
- */
- if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
- alloc_flags))
- return COMPACT_SUCCESS;
-
/*
* Watermarks for order-0 must be met for compaction to be able to
* isolate free pages for migration targets. This means that the
@@ -2229,29 +2267,20 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
- if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
- ALLOC_CMA, wmark_target))
- return COMPACT_SKIPPED;
-
- return COMPACT_CONTINUE;
+ return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+ ALLOC_CMA, wmark_target);
}
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
- * Returns
- * COMPACT_SKIPPED - If there are too few free pages for compaction
- * COMPACT_SUCCESS - If the allocation would succeed without compaction
- * COMPACT_CONTINUE - If compaction should run now
*/
-enum compact_result compaction_suitable(struct zone *zone, int order,
- unsigned int alloc_flags,
- int highest_zoneidx)
+bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
{
- enum compact_result ret;
- int fragindex;
+ enum compact_result compact_result;
+ bool suitable;
- ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
- zone_page_state(zone, NR_FREE_PAGES));
+ suitable = __compaction_suitable(zone, order, highest_zoneidx,
+ zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
* low memory or external fragmentation
@@ -2268,17 +2297,24 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
* excessive compaction for costly orders, but it should not be at the
* expense of system stability.
*/
- if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
- fragindex = fragmentation_index(zone, order);
- if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
- ret = COMPACT_NOT_SUITABLE_ZONE;
+ if (suitable) {
+ compact_result = COMPACT_CONTINUE;
+ if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ int fragindex = fragmentation_index(zone, order);
+
+ if (fragindex >= 0 &&
+ fragindex <= sysctl_extfrag_threshold) {
+ suitable = false;
+ compact_result = COMPACT_NOT_SUITABLE_ZONE;
+ }
+ }
+ } else {
+ compact_result = COMPACT_SKIPPED;
}
- trace_mm_compaction_suitable(zone, order, ret);
- if (ret == COMPACT_NOT_SUITABLE_ZONE)
- ret = COMPACT_SKIPPED;
+ trace_mm_compaction_suitable(zone, order, compact_result);
- return ret;
+ return suitable;
}
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
@@ -2294,7 +2330,6 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
- enum compact_result compact_result;
/*
* Do not consider all the reclaimable memory because we do not
@@ -2304,9 +2339,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
*/
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
- compact_result = __compaction_suitable(zone, order, alloc_flags,
- ac->highest_zoneidx, available);
- if (compact_result == COMPACT_CONTINUE)
+ if (__compaction_suitable(zone, order, ac->highest_zoneidx,
+ available))
return true;
}
@@ -2336,11 +2370,22 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
INIT_LIST_HEAD(&cc->migratepages);
cc->migratetype = gfp_migratetype(cc->gfp_mask);
- ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
- cc->highest_zoneidx);
- /* Compaction is likely to fail */
- if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
- return ret;
+
+ if (!is_via_compact_memory(cc->order)) {
+ unsigned long watermark;
+
+ /* Allocation can already succeed, nothing to do */
+ watermark = wmark_pages(cc->zone,
+ cc->alloc_flags & ALLOC_WMARK_MASK);
+ if (zone_watermark_ok(cc->zone, cc->order, watermark,
+ cc->highest_zoneidx, cc->alloc_flags))
+ return COMPACT_SUCCESS;
+
+ /* Compaction is likely to fail */
+ if (!compaction_suitable(cc->zone, cc->order,
+ cc->highest_zoneidx))
+ return COMPACT_SKIPPED;
+ }
/*
* Clear pageblock skip if there were failures recently and compaction
@@ -2456,7 +2501,8 @@ rescan:
}
/*
* If an ASYNC or SYNC_LIGHT fails to migrate a page
- * within the current order-aligned block, scan the
+ * within the current order-aligned block and
+ * fast_find_migrateblock may be used then scan the
* remainder of the pageblock. This will mark the
* pageblock "skip" to avoid rescanning in the near
* future. This will isolate more pages than necessary
@@ -2464,8 +2510,9 @@ rescan:
* fast_find_migrateblock revisiting blocks that were
* recently partially scanned.
*/
- if (cc->direct_compaction && !cc->finish_pageblock &&
- (cc->mode < MIGRATE_SYNC)) {
+ if (!pageblock_aligned(cc->migrate_pfn) &&
+ !cc->ignore_skip_hint && !cc->finish_pageblock &&
+ (cc->mode < MIGRATE_SYNC)) {
cc->finish_pageblock = true;
/*
@@ -2780,6 +2827,15 @@ static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int
static int sysctl_compaction_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, length, ppos);
+ if (ret)
+ return ret;
+
+ if (sysctl_compact_memory != 1)
+ return -EINVAL;
+
if (write)
compact_nodes();
@@ -2833,8 +2889,14 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
if (!populated_zone(zone))
continue;
- if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
- highest_zoneidx) == COMPACT_CONTINUE)
+ /* Allocation can already succeed, check other zones */
+ if (zone_watermark_ok(zone, pgdat->kcompactd_max_order,
+ min_wmark_pages(zone),
+ highest_zoneidx, 0))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order,
+ highest_zoneidx))
return true;
}
@@ -2871,8 +2933,12 @@ static void kcompactd_do_work(pg_data_t *pgdat)
if (compaction_deferred(zone, cc.order))
continue;
- if (compaction_suitable(zone, cc.order, 0, zoneid) !=
- COMPACT_CONTINUE)
+ /* Allocation can already succeed, nothing to do */
+ if (zone_watermark_ok(zone, cc.order,
+ min_wmark_pages(zone), zoneid, 0))
+ continue;
+
+ if (!compaction_suitable(zone, cc.order, zoneid))
continue;
if (kthread_should_stop())
@@ -3021,7 +3087,7 @@ static int kcompactd(void *p)
* This kcompactd start function will be called by init and node-hot-add.
* On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
*/
-void kcompactd_run(int nid)
+void __meminit kcompactd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
@@ -3039,7 +3105,7 @@ void kcompactd_run(int nid)
* Called by memory hotplug when all memory in a node is offlined. Caller must
* be holding mem_hotplug_begin/done().
*/
-void kcompactd_stop(int nid)
+void __meminit kcompactd_stop(int nid)
{
struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
@@ -3095,7 +3161,7 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
static struct ctl_table vm_compaction[] = {
{
.procname = "compact_memory",
- .data = NULL,
+ .data = &sysctl_compact_memory,
.maxlen = sizeof(int),
.mode = 0200,
.proc_handler = sysctl_compaction_handler,
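
To put numbers on the too_many_isolated() change above: with inactive + active = 1600 pages on the node, a regular __GFP_FS compactor now shifts both counts right by three and throttles once it has isolated more than 1600/16 = 100 pages, while a GFP_NOFS compactor keeps the full 1600/2 = 800-page budget, so it can keep making progress instead of waiting behind isolations held by threads that may be blocked on filesystem locks it owns.

The other recurring theme in the compaction.c hunks is that compaction_suitable() now returns a bool and the "allocation can already succeed" watermark check has moved into its callers. A condensed restatement of the new caller-side convention, pieced together from the compact_zone() and kcompactd hunks above (fragment only, not a complete function):

	if (!is_via_compact_memory(order)) {
		unsigned long watermark = wmark_pages(zone,
					alloc_flags & ALLOC_WMARK_MASK);

		/* Allocation can already succeed: nothing for compaction to do. */
		if (zone_watermark_ok(zone, order, watermark,
				      highest_zoneidx, alloc_flags))
			return COMPACT_SUCCESS;

		/* Too few order-0 free pages to migrate into: likely to fail. */
		if (!compaction_suitable(zone, order, highest_zoneidx))
			return COMPACT_SKIPPED;
	}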
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index fae64d32b925..c11210124344 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -318,6 +318,29 @@ static void damon_test_update_monitoring_result(struct kunit *test)
KUNIT_EXPECT_EQ(test, r->age, 20);
}
+static void damon_test_set_attrs(struct kunit *test)
+{
+ struct damon_ctx ctx;
+ struct damon_attrs valid_attrs = {
+ .min_nr_regions = 10, .max_nr_regions = 1000,
+ .sample_interval = 5000, .aggr_interval = 100000,};
+ struct damon_attrs invalid_attrs;
+
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &valid_attrs), 0);
+
+ invalid_attrs = valid_attrs;
+ invalid_attrs.min_nr_regions = 1;
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+
+ invalid_attrs = valid_attrs;
+ invalid_attrs.max_nr_regions = 9;
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+
+ invalid_attrs = valid_attrs;
+ invalid_attrs.aggr_interval = 4999;
+ KUNIT_EXPECT_EQ(test, damon_set_attrs(&ctx, &invalid_attrs), -EINVAL);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -329,6 +352,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_ops_registration),
KUNIT_CASE(damon_test_set_regions),
KUNIT_CASE(damon_test_update_monitoring_result),
+ KUNIT_CASE(damon_test_set_attrs),
{},
};
diff --git a/mm/damon/core.c b/mm/damon/core.c
index d9ef62047bf5..91cff7f2997e 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -551,6 +551,8 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
return -EINVAL;
if (attrs->min_nr_regions > attrs->max_nr_regions)
return -EINVAL;
+ if (attrs->sample_interval > attrs->aggr_interval)
+ return -EINVAL;
damon_update_monitoring_results(ctx, attrs);
ctx->attrs = *attrs;
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index cc63cf953636..e940802a15a4 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -37,51 +37,29 @@ struct folio *damon_get_folio(unsigned long pfn)
return folio;
}
-void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
+void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr)
{
- bool referenced = false;
- struct folio *folio = damon_get_folio(pte_pfn(*pte));
+ struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte)));
if (!folio)
return;
- if (pte_young(*pte)) {
- referenced = true;
- *pte = pte_mkold(*pte);
- }
-
-#ifdef CONFIG_MMU_NOTIFIER
- if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE))
- referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
-
- if (referenced)
+ if (ptep_clear_young_notify(vma, addr, pte))
folio_set_young(folio);
folio_set_idle(folio);
folio_put(folio);
}
-void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
+void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- bool referenced = false;
struct folio *folio = damon_get_folio(pmd_pfn(*pmd));
if (!folio)
return;
- if (pmd_young(*pmd)) {
- referenced = true;
- *pmd = pmd_mkold(*pmd);
- }
-
-#ifdef CONFIG_MMU_NOTIFIER
- if (mmu_notifier_clear_young(mm, addr, addr + HPAGE_PMD_SIZE))
- referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
-
- if (referenced)
+ if (pmdp_clear_young_notify(vma, addr, pmd))
folio_set_young(folio);
folio_set_idle(folio);
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 14f4bc69f29b..18d837d11bce 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -9,8 +9,8 @@
struct folio *damon_get_folio(unsigned long pfn);
-void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
-void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
+void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr);
+void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr);
int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 467b99166b43..40801e38fcf0 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -24,9 +24,9 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte)
- damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr);
+ damon_ptep_mkold(pvmw.pte, vma, addr);
else
- damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr);
+ damon_pmdp_mkold(pvmw.pmd, vma, addr);
}
return true;
}
@@ -89,7 +89,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte) {
- *accessed = pte_young(*pvmw.pte) ||
+ *accessed = pte_young(ptep_get(pvmw.pte)) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 1fec16d7263e..2fcc9731528a 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -311,19 +311,21 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
}
if (pmd_trans_huge(*pmd)) {
- damon_pmdp_mkold(pmd, walk->mm, addr);
+ damon_pmdp_mkold(pmd, walk->vma, addr);
spin_unlock(ptl);
return 0;
}
spin_unlock(ptl);
}
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- return 0;
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte_present(*pte))
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+ if (!pte_present(ptep_get(pte)))
goto out;
- damon_ptep_mkold(pte, walk->mm, addr);
+ damon_ptep_mkold(pte, walk->vma, addr);
out:
pte_unmap_unlock(pte, ptl);
return 0;
@@ -431,6 +433,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
pte_t *pte;
+ pte_t ptent;
spinlock_t *ptl;
struct folio *folio;
struct damon_young_walk_private *priv = walk->private;
@@ -464,15 +467,18 @@ huge_out:
regular_page:
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- return -EINVAL;
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte_present(*pte))
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
+ ptent = ptep_get(pte);
+ if (!pte_present(ptent))
goto out;
- folio = damon_get_folio(pte_pfn(*pte));
+ folio = damon_get_folio(pte_pfn(ptent));
if (!folio)
goto out;
- if (pte_young(*pte) || !folio_test_idle(folio) ||
+ if (pte_young(ptent) || !folio_test_idle(folio) ||
mmu_notifier_test_young(walk->mm, addr))
priv->young = true;
*priv->folio_sz = folio_size(folio);
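
Both vaddr.c hunks above drop the pmd_none()/pmd_bad() pre-checks and instead handle pte_offset_map_lock() returning NULL by asking the walker to retry; related NULL handling shows up again in the debug_vm_pgtable.c and filemap.c hunks further down. Pulled out on its own, the page-table-walk shape used here looks roughly like this (callback body elided, names as in the hunks above):

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		/* The PTE table vanished or changed under us: rewalk this range. */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	if (!pte_present(ptep_get(pte)))
		goto out;
	/* ...operate on the mapped, locked PTE... */
out:
	pte_unmap_unlock(pte, ptl);
	return 0;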
diff --git a/mm/debug.c b/mm/debug.c
index c7b228097bd9..ee533a5ceb79 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -268,4 +268,13 @@ void page_init_poison(struct page *page, size_t size)
if (page_init_poisoning)
memset(page, PAGE_POISON_PATTERN, size);
}
+
+void vma_iter_dump_tree(const struct vma_iterator *vmi)
+{
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ mas_dump(&vmi->mas);
+ mt_dump(vmi->mas.tree, mt_dump_hex);
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
+}
+
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
new file mode 100644
index 000000000000..f9d145730fd1
--- /dev/null
+++ b/mm/debug_page_alloc.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/page-isolation.h>
+
+unsigned int _debug_guardpage_minorder;
+
+bool _debug_pagealloc_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
+
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
+
+static int __init early_debug_pagealloc(char *buf)
+{
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
+}
+early_param("debug_pagealloc", early_debug_pagealloc);
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+ unsigned long res;
+
+ if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+ pr_err("Bad debug_guardpage_minorder value\n");
+ return 0;
+ }
+ _debug_guardpage_minorder = res;
+ pr_info("Setting debug_guardpage_minorder to %lu\n", res);
+ return 0;
+}
+early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
+
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
+ int migratetype)
+{
+ if (order >= debug_guardpage_minorder())
+ return false;
+
+ __SetPageGuard(page);
+ INIT_LIST_HEAD(&page->buddy_list);
+ set_page_private(page, order);
+ /* Guard pages are not available for any usage */
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -(1 << order), migratetype);
+
+ return true;
+}
+
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
+ int migratetype)
+{
+ __ClearPageGuard(page);
+
+ set_page_private(page, 0);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, (1 << order), migratetype);
+}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c54177aabebd..ee119e33fef1 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -138,6 +138,9 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args)
return;
pr_debug("Validating PTE advanced\n");
+ if (WARN_ON(!args->ptep))
+ return;
+
pte = pfn_pte(args->pte_pfn, args->page_prot);
set_pte_at(args->mm, args->vaddr, args->ptep, pte);
flush_dcache_page(page);
@@ -619,6 +622,9 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args)
* the unexpected overhead of cache flushing is acceptable.
*/
pr_debug("Validating PTE clear\n");
+ if (WARN_ON(!args->ptep))
+ return;
+
#ifndef CONFIG_RISCV
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
#endif
@@ -1377,7 +1383,8 @@ static int __init debug_vm_pgtable(void)
args.ptep = pte_offset_map_lock(args.mm, args.pmdp, args.vaddr, &ptl);
pte_clear_tests(&args);
pte_advanced_tests(&args);
- pte_unmap_unlock(args.ptep, ptl);
+ if (args.ptep)
+ pte_unmap_unlock(args.ptep, ptl);
ptl = pmd_lock(args.mm, args.pmdp);
pmd_clear_tests(&args);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index d2b0f8fc9649..a151a21e571b 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -226,7 +226,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
{
struct dma_pool *retval;
size_t allocation;
- bool empty = false;
+ bool empty;
if (!dev)
return NULL;
@@ -276,8 +276,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
*/
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
- if (list_empty(&dev->dma_pools))
- empty = true;
+ empty = list_empty(&dev->dma_pools);
list_add(&retval->pools, &dev->dma_pools);
mutex_unlock(&pools_lock);
if (empty) {
@@ -361,7 +360,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
void dma_pool_destroy(struct dma_pool *pool)
{
struct dma_page *page, *tmp;
- bool empty = false, busy = false;
+ bool empty, busy = false;
if (unlikely(!pool))
return;
@@ -369,8 +368,7 @@ void dma_pool_destroy(struct dma_pool *pool)
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
list_del(&pool->pools);
- if (list_empty(&pool->dev->dma_pools))
- empty = true;
+ empty = list_empty(&pool->dev->dma_pools);
mutex_unlock(&pools_lock);
if (empty)
device_remove_file(pool->dev, &dev_attr_pools);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 9bc12e526ed0..ce06b2884789 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -72,12 +72,10 @@ void __init early_ioremap_setup(void)
{
int i;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- if (WARN_ON(prev_map[i]))
- break;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ WARN_ON_ONCE(prev_map[i]);
slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+ }
}
static int __init check_early_ioremap_leak(void)
diff --git a/mm/fadvise.c b/mm/fadvise.c
index fb7c5f43fd2a..6c39d42f16dc 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -14,7 +14,6 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
-#include <linux/pagevec.h>
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
@@ -143,7 +142,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
}
if (end_index >= start_index) {
- unsigned long nr_pagevec = 0;
+ unsigned long nr_failed = 0;
/*
* It's common to FADV_DONTNEED right after
@@ -156,17 +155,15 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
*/
lru_add_drain();
- invalidate_mapping_pagevec(mapping,
- start_index, end_index,
- &nr_pagevec);
+ mapping_try_invalidate(mapping, start_index, end_index,
+ &nr_failed);
/*
- * If fewer pages were invalidated than expected then
- * it is possible that some of the pages were on
- * a per-cpu pagevec for a remote CPU. Drain all
- * pagevecs and try again.
+ * The failures may be due to the folio being
+ * in the LRU cache of a remote CPU. Drain all
+ * caches and try again.
*/
- if (nr_pagevec) {
+ if (nr_failed) {
lru_add_drain_all();
invalidate_mapping_pages(mapping, start_index,
end_index);
diff --git a/mm/fail_page_alloc.c b/mm/fail_page_alloc.c
new file mode 100644
index 000000000000..b1b09cce9394
--- /dev/null
+++ b/mm/fail_page_alloc.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fault-inject.h>
+#include <linux/mm.h>
+
+static struct {
+ struct fault_attr attr;
+
+ bool ignore_gfp_highmem;
+ bool ignore_gfp_reclaim;
+ u32 min_order;
+} fail_page_alloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_reclaim = true,
+ .ignore_gfp_highmem = true,
+ .min_order = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+ return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ int flags = 0;
+
+ if (order < fail_page_alloc.min_order)
+ return false;
+ if (gfp_mask & __GFP_NOFAIL)
+ return false;
+ if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+ return false;
+ if (fail_page_alloc.ignore_gfp_reclaim &&
+ (gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* See comment in __should_failslab() */
+ if (gfp_mask & __GFP_NOWARN)
+ flags |= FAULT_NOWARN;
+
+ return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+ umode_t mode = S_IFREG | 0600;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+ &fail_page_alloc.attr);
+
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
+
+ return 0;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/mm/filemap.c b/mm/filemap.c
index b4c9bd368b7e..9e44a49bbd74 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
@@ -58,6 +59,8 @@
#include <asm/mman.h>
+#include "swap.h"
+
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -114,7 +117,7 @@
* ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
+ * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->block_dirty_folio)
@@ -1359,8 +1362,6 @@ repeat:
/**
* migration_entry_wait_on_locked - Wait for a migration entry to be removed
* @entry: migration swap entry.
- * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required
- * for pte entries, pass NULL for pmd entries.
* @ptl: already locked ptl. This function will drop the lock.
*
* Wait for a migration entry referencing the given page to be removed. This is
@@ -1369,13 +1370,13 @@ repeat:
* should be called while holding the ptl for the migration entry referencing
* the page.
*
- * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock().
+ * Returns after unlocking the ptl.
*
* This follows the same logic as folio_wait_bit_common() so see the comments
* there.
*/
-void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
- spinlock_t *ptl)
+void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
+ __releases(ptl)
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
@@ -1409,10 +1410,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
* a valid reference to the page, and it must take the ptl to remove the
* migration entry. So the page is valid until the ptl is dropped.
*/
- if (ptep)
- pte_unmap_unlock(ptep, ptl);
- else
- spin_unlock(ptl);
+ spin_unlock(ptl);
for (;;) {
unsigned int flags;
@@ -1625,36 +1623,6 @@ void folio_end_writeback(struct folio *folio)
}
EXPORT_SYMBOL(folio_end_writeback);
-/*
- * After completing I/O on a page, call this routine to update the page
- * flags appropriately
- */
-void page_endio(struct page *page, bool is_write, int err)
-{
- struct folio *folio = page_folio(page);
-
- if (!is_write) {
- if (!err) {
- folio_mark_uptodate(folio);
- } else {
- folio_clear_uptodate(folio);
- folio_set_error(folio);
- }
- folio_unlock(folio);
- } else {
- if (err) {
- struct address_space *mapping;
-
- folio_set_error(folio);
- mapping = folio_mapping(folio);
- if (mapping)
- mapping_set_error(mapping, err);
- }
- folio_end_writeback(folio);
- }
-}
-EXPORT_SYMBOL_GPL(page_endio);
-
/**
* __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
* @folio: The folio to lock
@@ -2687,8 +2655,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
if (unlikely(iocb->ki_pos >= i_size_read(inode)))
break;
- error = filemap_get_pages(iocb, iter->count, &fbatch,
- iov_iter_is_pipe(iter));
+ error = filemap_get_pages(iocb, iter->count, &fbatch, false);
if (error < 0)
break;
@@ -2762,6 +2729,48 @@ put_folios:
}
EXPORT_SYMBOL_GPL(filemap_read);
+int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ loff_t pos = iocb->ki_pos;
+ loff_t end = pos + count - 1;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (filemap_range_needs_writeback(mapping, pos, end))
+ return -EAGAIN;
+ return 0;
+ }
+
+ return filemap_write_and_wait_range(mapping, pos, end);
+}
+
+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ loff_t pos = iocb->ki_pos;
+ loff_t end = pos + count - 1;
+ int ret;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* we could block if there are any pages in the range */
+ if (filemap_range_has_page(mapping, pos, end))
+ return -EAGAIN;
+ } else {
+ ret = filemap_write_and_wait_range(mapping, pos, end);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+}
+
/**
* generic_file_read_iter - generic filesystem read routine
* @iocb: kernel I/O control block
@@ -2797,18 +2806,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1))
- return -EAGAIN;
- } else {
- retval = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (retval < 0)
- return retval;
- }
-
+ retval = kiocb_write_and_wait(iocb, count);
+ if (retval < 0)
+ return retval;
file_accessed(file);
retval = mapping->a_ops->direct_IO(iocb, iter);
@@ -2872,9 +2872,24 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
return spliced;
}
-/*
- * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into
- * a pipe.
+/**
+ * filemap_splice_read - Splice data from a file's pagecache into a pipe
+ * @in: The file to read from
+ * @ppos: Pointer to the file position to read from
+ * @pipe: The pipe to splice into
+ * @len: The amount to splice
+ * @flags: The SPLICE_F_* flags
+ *
+ * This function gets folios from a file's pagecache and splices them into the
+ * pipe. Readahead will be called as necessary to fill more folios. This may
+ * be used for blockdevs also.
+ *
+ * Return: On success, the number of bytes read will be returned and *@ppos
+ * will be updated if appropriate; 0 will be returned if there is no more data
+ * to be read; -EAGAIN will be returned if the pipe had no space, and some
+ * other negative error code will be returned on error. A short read may occur
+ * if the pipe has insufficient space, we reach the end of the data or we hit a
+ * hole.
*/
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
@@ -2887,6 +2902,9 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
bool writably_mapped;
int i, error = 0;
+ if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
+ return 0;
+
init_sync_kiocb(&iocb, in);
iocb.ki_pos = *ppos;
@@ -2900,7 +2918,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
do {
cond_resched();
- if (*ppos >= i_size_read(file_inode(in)))
+ if (*ppos >= i_size_read(in->f_mapping->host))
break;
iocb.ki_pos = *ppos;
@@ -2916,7 +2934,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
- isize = i_size_read(file_inode(in));
+ isize = i_size_read(in->f_mapping->host);
if (unlikely(*ppos >= isize))
break;
end_offset = min_t(loff_t, isize, *ppos + len);
@@ -3413,13 +3431,6 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
if (pmd_none(*vmf->pmd))
pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
- /* See comment in handle_pte_fault() */
- if (pmd_devmap_trans_unstable(vmf->pmd)) {
- folio_unlock(folio);
- folio_put(folio);
- return true;
- }
-
return false;
}
@@ -3506,6 +3517,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
+ if (!vmf->pte) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+ }
do {
again:
page = folio_file_page(folio, xas.xa_index);
@@ -3524,7 +3540,7 @@ again:
* handled in the specific fault path, and it'll prohibit the
* fault-around logic.
*/
- if (!pte_none(*vmf->pte))
+ if (!pte_none(ptep_get(vmf->pte)))
goto unlock;
/* We're about to handle the fault */
@@ -3783,7 +3799,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
/*
* Warn about a page cache invalidation failure during a direct I/O write.
*/
-void dio_warn_stale_pagecache(struct file *filp)
+static void dio_warn_stale_pagecache(struct file *filp)
{
static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
char pathname[128];
@@ -3800,48 +3816,33 @@ void dio_warn_stale_pagecache(struct file *filp)
}
}
-ssize_t
-generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- loff_t pos = iocb->ki_pos;
- ssize_t written;
- size_t write_len;
- pgoff_t end;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
- write_len = iov_iter_count(from);
- end = (pos + write_len - 1) >> PAGE_SHIFT;
+ if (mapping->nrpages &&
+ invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT,
+ (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
+ dio_warn_stale_pagecache(iocb->ki_filp);
+}
- if (iocb->ki_flags & IOCB_NOWAIT) {
- /* If there are pages to writeback, return */
- if (filemap_range_has_page(file->f_mapping, pos,
- pos + write_len - 1))
- return -EAGAIN;
- } else {
- written = filemap_write_and_wait_range(mapping, pos,
- pos + write_len - 1);
- if (written)
- goto out;
- }
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ size_t write_len = iov_iter_count(from);
+ ssize_t written;
/*
- * After a write we want buffered reads to be sure to go to disk to get
- * the new data. We invalidate clean cached page from the region we're
- * about to write. We do this *before* the write so that we can return
- * without clobbering -EIOCBQUEUED from ->direct_IO().
- */
- written = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
- /*
* If a page can not be invalidated, return 0 to fall back
* to buffered write.
*/
+ written = kiocb_invalidate_pages(iocb, write_len);
if (written) {
if (written == -EBUSY)
return 0;
- goto out;
+ return written;
}
written = mapping->a_ops->direct_IO(iocb, from);
@@ -3863,11 +3864,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
*
* Skip invalidation for async writes or if mapping has no pages.
*/
- if (written > 0 && mapping->nrpages &&
- invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
- dio_warn_stale_pagecache(file);
-
if (written > 0) {
+ struct inode *inode = mapping->host;
+ loff_t pos = iocb->ki_pos;
+
+ kiocb_invalidate_post_direct_write(iocb, written);
pos += written;
write_len -= written;
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -3878,7 +3879,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
}
if (written != -EIOCBQUEUED)
iov_iter_revert(from, write_len - iov_iter_count(from));
-out:
return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
@@ -3957,7 +3957,10 @@ again:
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
- return written ? written : status;
+ if (!written)
+ return status;
+ iocb->ki_pos += written;
+ return written;
}
EXPORT_SYMBOL(generic_perform_write);
@@ -3986,25 +3989,19 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t written = 0;
- ssize_t err;
- ssize_t status;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_privs(file);
- if (err)
- goto out;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
- err = file_update_time(file);
- if (err)
- goto out;
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
- if (iocb->ki_flags & IOCB_DIRECT) {
- loff_t pos, endbyte;
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
- written = generic_file_direct_write(iocb, from);
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = generic_file_direct_write(iocb, from);
/*
* If the write stopped short of completing, fall back to
* buffered writes. Some filesystems do this for writes to
@@ -4012,49 +4009,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* not succeed (even if it did, DAX does not handle dirty
* page-cache pages correctly).
*/
- if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
- goto out;
-
- pos = iocb->ki_pos;
- status = generic_perform_write(iocb, from);
- /*
- * If generic_perform_write() returned a synchronous error
- * then we want to return the number of bytes which were
- * direct-written, or the error code if that was zero. Note
- * that this differs from normal direct-io semantics, which
- * will return -EFOO even if some bytes were written.
- */
- if (unlikely(status < 0)) {
- err = status;
- goto out;
- }
- /*
- * We need to ensure that the page cache pages are written to
- * disk and invalidated to preserve the expected O_DIRECT
- * semantics.
- */
- endbyte = pos + status - 1;
- err = filemap_write_and_wait_range(mapping, pos, endbyte);
- if (err == 0) {
- iocb->ki_pos = endbyte + 1;
- written += status;
- invalidate_mapping_pages(mapping,
- pos >> PAGE_SHIFT,
- endbyte >> PAGE_SHIFT);
- } else {
- /*
- * We don't know how much we wrote, so just return
- * the number of bytes which were direct-written
- */
- }
- } else {
- written = generic_perform_write(iocb, from);
- if (likely(written > 0))
- iocb->ki_pos += written;
+ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
+ return ret;
+ return direct_write_fallback(iocb, from, ret,
+ generic_perform_write(iocb, from));
}
-out:
- current->backing_dev_info = NULL;
- return written ? written : err;
+
+ return generic_perform_write(iocb, from);
}
EXPORT_SYMBOL(__generic_file_write_iter);
@@ -4119,3 +4080,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);
+
+#ifdef CONFIG_CACHESTAT_SYSCALL
+/**
+ * filemap_cachestat() - compute the page cache statistics of a mapping
+ * @mapping: The mapping to compute the statistics for.
+ * @first_index: The starting page cache index.
+ * @last_index: The final page index (inclusive).
+ * @cs: the cachestat struct to write the result to.
+ *
+ * This will query the page cache statistics of a mapping in the
+ * page range of [first_index, last_index] (inclusive). The statistics
+ * queried include: number of cached pages, number of dirty pages, number
+ * of pages marked for writeback, and the number of (recently) evicted pages.
+ */
+static void filemap_cachestat(struct address_space *mapping,
+ pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
+{
+ XA_STATE(xas, &mapping->i_pages, first_index);
+ struct folio *folio;
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_index) {
+ unsigned long nr_pages;
+ pgoff_t folio_first_index, folio_last_index;
+
+ if (xas_retry(&xas, folio))
+ continue;
+
+ if (xa_is_value(folio)) {
+ /* page is evicted */
+ void *shadow = (void *)folio;
+ bool workingset; /* not used */
+ int order = xa_get_order(xas.xa, xas.xa_index);
+
+ nr_pages = 1 << order;
+ folio_first_index = round_down(xas.xa_index, 1 << order);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
+ cs->nr_evicted += nr_pages;
+
+#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
+ if (shmem_mapping(mapping)) {
+ /* shmem file - in swap cache */
+ swp_entry_t swp = radix_to_swp_entry(folio);
+
+ shadow = get_shadow_from_swap_cache(swp);
+ }
+#endif
+ if (workingset_test_recent(shadow, true, &workingset))
+ cs->nr_recently_evicted += nr_pages;
+
+ goto resched;
+ }
+
+ nr_pages = folio_nr_pages(folio);
+ folio_first_index = folio_pgoff(folio);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
+ /* page is in cache */
+ cs->nr_cache += nr_pages;
+
+ if (folio_test_dirty(folio))
+ cs->nr_dirty += nr_pages;
+
+ if (folio_test_writeback(folio))
+ cs->nr_writeback += nr_pages;
+
+resched:
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * The cachestat(2) system call.
+ *
+ * cachestat() returns the page cache statistics of a file in the
+ * bytes range specified by `off` and `len`: number of cached pages,
+ * number of dirty pages, number of pages marked for writeback,
+ * number of evicted pages, and number of recently evicted pages.
+ *
+ * An evicted page is a page that was previously in the page cache
+ * but has since been evicted. A page is recently evicted if its last
+ * eviction was recent enough that its reentry to the cache would
+ * indicate that it is actively being used by the system, and that
+ * there is memory pressure on the system.
+ *
+ * `off` and `len` must be non-negative integers. If `len` > 0,
+ * the queried range is [`off`, `off` + `len` - 1]. If `len` == 0,
+ * we will query in the range from `off` to the end of the file.
+ *
+ * The `flags` argument is unused for now, but is included for future
+ * extensibility. Users should pass 0 (i.e. no flags specified).
+ *
+ * Currently, hugetlbfs is not supported.
+ *
+ * Because the status of a page can change after cachestat() checks it
+ * but before it returns to the application, the returned values may
+ * contain stale information.
+ *
+ * return values:
+ * zero - success
+ * -EFAULT - cstat or cstat_range points to an illegal address
+ * -EINVAL - invalid flags
+ * -EBADF - invalid file descriptor
+ * -EOPNOTSUPP - file descriptor is of a hugetlbfs file
+ */
+SYSCALL_DEFINE4(cachestat, unsigned int, fd,
+ struct cachestat_range __user *, cstat_range,
+ struct cachestat __user *, cstat, unsigned int, flags)
+{
+ struct fd f = fdget(fd);
+ struct address_space *mapping;
+ struct cachestat_range csr;
+ struct cachestat cs;
+ pgoff_t first_index, last_index;
+
+ if (!f.file)
+ return -EBADF;
+
+ if (copy_from_user(&csr, cstat_range,
+ sizeof(struct cachestat_range))) {
+ fdput(f);
+ return -EFAULT;
+ }
+
+ /* hugetlbfs is not supported */
+ if (is_file_hugepages(f.file)) {
+ fdput(f);
+ return -EOPNOTSUPP;
+ }
+
+ if (flags != 0) {
+ fdput(f);
+ return -EINVAL;
+ }
+
+ first_index = csr.off >> PAGE_SHIFT;
+ last_index =
+ csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
+ memset(&cs, 0, sizeof(struct cachestat));
+ mapping = f.file->f_mapping;
+ filemap_cachestat(mapping, first_index, last_index, &cs);
+ fdput(f);
+
+ if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
+ return -EFAULT;
+
+ return 0;
+}
+#endif /* CONFIG_CACHESTAT_SYSCALL */
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 279e55b4ed87..2fb5df3384b8 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -206,6 +206,7 @@ int __frontswap_load(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
+ bool exclusive = false;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(!PageLocked(page));
@@ -215,9 +216,14 @@ int __frontswap_load(struct page *page)
return -1;
/* Try loading from each implementation, until one succeeds. */
- ret = frontswap_ops->load(type, offset, page);
- if (ret == 0)
+ ret = frontswap_ops->load(type, offset, page, &exclusive);
+ if (ret == 0) {
inc_frontswap_loads();
+ if (exclusive) {
+ SetPageDirty(page);
+ __frontswap_clear(sis, offset);
+ }
+ }
return ret;
}
diff --git a/mm/gup.c b/mm/gup.c
index bbe416236593..ef29641671c7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -18,6 +18,7 @@
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
+#include <linux/shmem_fs.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -51,7 +52,8 @@ static inline void sanity_check_pinned_pages(struct page **pages,
struct page *page = *pages;
struct folio *folio = page_folio(page);
- if (!folio_test_anon(folio))
+ if (is_zero_page(page) ||
+ !folio_test_anon(folio))
continue;
if (!folio_test_large(folio) || folio_test_hugetlb(folio))
VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
@@ -123,63 +125,72 @@ retry:
*/
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
+ struct folio *folio;
+
+ if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
+ return NULL;
+
if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
return NULL;
if (flags & FOLL_GET)
return try_get_folio(page, refs);
- else if (flags & FOLL_PIN) {
- struct folio *folio;
- /*
- * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
- * right zone, so fail and let the caller fall back to the slow
- * path.
- */
- if (unlikely((flags & FOLL_LONGTERM) &&
- !is_longterm_pinnable_page(page)))
- return NULL;
-
- /*
- * CAUTION: Don't use compound_head() on the page before this
- * point, the result won't be stable.
- */
- folio = try_get_folio(page, refs);
- if (!folio)
- return NULL;
+ /* FOLL_PIN is set */
- /*
- * When pinning a large folio, use an exact count to track it.
- *
- * However, be sure to *also* increment the normal folio
- * refcount field at least once, so that the folio really
- * is pinned. That's why the refcount from the earlier
- * try_get_folio() is left intact.
- */
- if (folio_test_large(folio))
- atomic_add(refs, &folio->_pincount);
- else
- folio_ref_add(folio,
- refs * (GUP_PIN_COUNTING_BIAS - 1));
- /*
- * Adjust the pincount before re-checking the PTE for changes.
- * This is essentially a smp_mb() and is paired with a memory
- * barrier in page_try_share_anon_rmap().
- */
- smp_mb__after_atomic();
+ /*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return page_folio(page);
- node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
+ folio = try_get_folio(page, refs);
+ if (!folio)
+ return NULL;
- return folio;
+ /*
+ * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in the
+ * right zone, so fail and let the caller fall back to the slow
+ * path.
+ */
+ if (unlikely((flags & FOLL_LONGTERM) &&
+ !folio_is_longterm_pinnable(folio))) {
+ if (!put_devmap_managed_page_refs(&folio->page, refs))
+ folio_put_refs(folio, refs);
+ return NULL;
}
- WARN_ON_ONCE(1);
- return NULL;
+ /*
+ * When pinning a large folio, use an exact count to track it.
+ *
+ * However, be sure to *also* increment the normal folio
+ * refcount field at least once, so that the folio really
+ * is pinned. That's why the refcount from the earlier
+ * try_get_folio() is left intact.
+ */
+ if (folio_test_large(folio))
+ atomic_add(refs, &folio->_pincount);
+ else
+ folio_ref_add(folio,
+ refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in page_try_share_anon_rmap().
+ */
+ smp_mb__after_atomic();
+
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
+
+ return folio;
}
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
if (flags & FOLL_PIN) {
+ if (is_zero_folio(folio))
+ return;
node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
if (folio_test_large(folio))
atomic_sub(refs, &folio->_pincount);
@@ -225,6 +236,13 @@ int __must_check try_grab_page(struct page *page, unsigned int flags)
folio_ref_inc(folio);
else if (flags & FOLL_PIN) {
/*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return 0;
+
+ /*
* Similar to try_grab_folio(): be sure to *also*
* increment the normal page refcount field at least once,
* so that the page really is pinned.
@@ -258,6 +276,33 @@ void unpin_user_page(struct page *page)
}
EXPORT_SYMBOL(unpin_user_page);
+/**
+ * folio_add_pin - Try to get an additional pin on a pinned folio
+ * @folio: The folio to be pinned
+ *
+ * Get an additional pin on a folio we already have a pin on. Makes no change
+ * if the folio is a zero_page.
+ */
+void folio_add_pin(struct folio *folio)
+{
+ if (is_zero_folio(folio))
+ return;
+
+ /*
+ * Similar to try_grab_folio(): be sure to *also* increment the normal
+ * page refcount field at least once, so that the page really is
+ * pinned.
+ */
+ if (folio_test_large(folio)) {
+ WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
+ folio_ref_inc(folio);
+ atomic_inc(&folio->_pincount);
+ } else {
+ WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
+ folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
+ }
+}
+
static inline struct folio *gup_folio_range_next(struct page *start,
unsigned long npages, unsigned long i, unsigned int *ntails)
{
@@ -476,13 +521,14 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
pte_t *pte, unsigned int flags)
{
if (flags & FOLL_TOUCH) {
- pte_t entry = *pte;
+ pte_t orig_entry = ptep_get(pte);
+ pte_t entry = orig_entry;
if (flags & FOLL_WRITE)
entry = pte_mkdirty(entry);
entry = pte_mkyoung(entry);
- if (!pte_same(*pte, entry)) {
+ if (!pte_same(orig_entry, entry)) {
set_pte_at(vma->vm_mm, address, pte, entry);
update_mmu_cache(vma, address, pte);
}
@@ -544,11 +590,11 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
- if (unlikely(pmd_bad(*pmd)))
- return no_page_table(vma, flags);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
- pte = *ptep;
+ if (!ptep)
+ return no_page_table(vma, flags);
+ pte = ptep_get(ptep);
if (!pte_present(pte))
goto no_page;
if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
@@ -653,11 +699,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
pmd = pmd_offset(pudp, address);
- /*
- * The READ_ONCE() will stabilize the pmdval in a register or
- * on the stack so that it will stop changing under the code.
- */
- pmdval = READ_ONCE(*pmd);
+ pmdval = pmdp_get_lockless(pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
if (!pmd_present(pmdval))
@@ -685,21 +727,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
if (flags & FOLL_SPLIT_PMD) {
- int ret;
- page = pmd_page(*pmd);
- if (is_huge_zero_page(page)) {
- spin_unlock(ptl);
- ret = 0;
- split_huge_pmd(vma, pmd, address);
- if (pmd_trans_unstable(pmd))
- ret = -EBUSY;
- } else {
- spin_unlock(ptl);
- split_huge_pmd(vma, pmd, address);
- ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
- }
-
- return ret ? ERR_PTR(ret) :
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, address);
+ /* If pmd was left empty, stuff a page table in there quickly */
+ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
@@ -835,6 +866,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ pte_t entry;
int ret = -EFAULT;
/* user gate pages are read-only */
@@ -855,18 +887,20 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return -EFAULT;
- VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map(pmd, address);
- if (pte_none(*pte))
+ if (!pte)
+ return -EFAULT;
+ entry = ptep_get(pte);
+ if (pte_none(entry))
goto unmap;
*vma = get_gate_vma(mm);
if (!page)
goto out;
- *page = vm_normal_page(*vma, address, *pte);
+ *page = vm_normal_page(*vma, address, entry);
if (!*page) {
- if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
+ if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
goto unmap;
- *page = pte_page(*pte);
+ *page = pte_page(entry);
}
ret = try_grab_page(*page, gup_flags);
if (unlikely(ret))
@@ -959,16 +993,54 @@ static int faultin_page(struct vm_area_struct *vma,
return 0;
}
+/*
+ * Writing to file-backed mappings which require folio dirty tracking using GUP
+ * is a fundamentally broken operation, as kernel write access to GUP mappings
+ * does not adhere to the semantics expected by a file system.
+ *
+ * Consider the following scenario:-
+ *
+ * 1. A folio is written to via GUP which write-faults the memory, notifying
+ * the file system and dirtying the folio.
+ * 2. Later, writeback is triggered, resulting in the folio being cleaned and
+ * the PTE being marked read-only.
+ * 3. The GUP caller writes to the folio, as it is mapped read/write via the
+ * direct mapping.
+ * 4. The GUP caller, now done with the page, unpins it and sets it dirty
+ * (though it does not have to).
+ *
+ * This results in both data being written to a folio without writenotify, and
+ * the folio being dirtied unexpectedly (if the caller decides to do so).
+ */
+static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
+ unsigned long gup_flags)
+{
+ /*
+ * If we aren't pinning then no problematic write can occur. A long term
+ * pin is the most egregious case so this is the case we disallow.
+ */
+ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
+ (FOLL_PIN | FOLL_LONGTERM))
+ return true;
+
+ /*
+ * If the VMA does not require dirty tracking then no problematic write
+ * can occur either.
+ */
+ return !vma_needs_dirty_tracking(vma);
+}
+
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
vm_flags_t vm_flags = vma->vm_flags;
int write = (gup_flags & FOLL_WRITE);
int foreign = (gup_flags & FOLL_REMOTE);
+ bool vma_anon = vma_is_anonymous(vma);
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
- if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
+ if ((gup_flags & FOLL_ANON) && !vma_anon)
return -EFAULT;
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
@@ -978,6 +1050,10 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
return -EFAULT;
if (write) {
+ if (!vma_anon &&
+ !writable_file_mapping_allowed(vma, gup_flags))
+ return -EFAULT;
+
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
@@ -1024,8 +1100,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: whether we're still with the mmap_lock held
*
* Returns either number of pages pinned (which may be less than the
@@ -1039,8 +1113,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_lock is held.
- *
* Must be called with mmap_lock held. It may be released. See below.
*
* __get_user_pages walks a process's page tables and takes a reference to
@@ -1076,7 +1148,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
@@ -1096,7 +1168,11 @@ static long __get_user_pages(struct mm_struct *mm,
/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) {
- vma = find_extend_vma(mm, start);
+ vma = find_vma(mm, start);
+ if (vma && (start < vma->vm_start)) {
+ WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
+ vma = NULL;
+ }
if (!vma && in_gate_area(mm, start)) {
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
@@ -1116,9 +1192,9 @@ static long __get_user_pages(struct mm_struct *mm,
goto out;
if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &nr_pages, i,
- gup_flags, locked);
+ i = follow_hugetlb_page(mm, vma, pages,
+ &start, &nr_pages, i,
+ gup_flags, locked);
if (!*locked) {
/*
* We've got a VM_FAULT_RETRY
@@ -1183,10 +1259,6 @@ retry:
ctx.page_mask = 0;
}
next_page:
- if (vmas) {
- vmas[i] = vma;
- ctx.page_mask = 0;
- }
page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
@@ -1265,9 +1337,13 @@ int fixup_user_fault(struct mm_struct *mm,
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
retry:
- vma = find_extend_vma(mm, address);
- if (!vma || address < vma->vm_start)
+ vma = find_vma(mm, address);
+ if (!vma)
+ return -EFAULT;
+ if (address < vma->vm_start) {
+ WARN_ON_ONCE(vma->vm_flags & VM_GROWSDOWN);
return -EFAULT;
+ }
if (!vma_permits_fault(vma, fault_flags))
return -EFAULT;
@@ -1341,7 +1417,6 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas,
int *locked,
unsigned int flags)
{
@@ -1379,7 +1454,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
pages_done = 0;
for (;;) {
ret = __get_user_pages(mm, start, nr_pages, flags, pages,
- vmas, locked);
+ locked);
if (!(flags & FOLL_UNLOCKABLE)) {
/* VM_FAULT_RETRY couldn't trigger, bypass */
pages_done = ret;
@@ -1443,7 +1518,7 @@ retry:
*locked = 1;
ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
- pages, NULL, locked);
+ pages, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
BUG_ON(ret != 0);
@@ -1541,7 +1616,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* not result in a stack expansion that recurses back here.
*/
ret = __get_user_pages(mm, start, nr_pages, gup_flags,
- NULL, NULL, locked ? locked : &local_locked);
+ NULL, locked ? locked : &local_locked);
lru_add_drain();
return ret;
}
@@ -1599,7 +1674,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
return -EINVAL;
ret = __get_user_pages(mm, start, nr_pages, gup_flags,
- NULL, NULL, locked);
+ NULL, locked);
lru_add_drain();
return ret;
}
@@ -1667,8 +1742,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
- struct vm_area_struct **vmas, int *locked,
- unsigned int foll_flags)
+ int *locked, unsigned int foll_flags)
{
struct vm_area_struct *vma;
bool must_unlock = false;
@@ -1712,8 +1786,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
if (pages[i])
get_page(pages[i]);
}
- if (vmas)
- vmas[i] = vma;
+
start = (start + PAGE_SIZE) & PAGE_MASK;
}
@@ -1894,8 +1967,7 @@ struct page *get_dump_page(unsigned long addr)
int locked = 0;
int ret;
- ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL,
- &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
return (ret == 1) ? page : NULL;
}
@@ -2068,7 +2140,6 @@ static long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas,
int *locked,
unsigned int gup_flags)
{
@@ -2076,13 +2147,13 @@ static long __gup_longterm_locked(struct mm_struct *mm,
long rc, nr_pinned_pages;
if (!(gup_flags & FOLL_LONGTERM))
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages,
locked, gup_flags);
flags = memalloc_pin_save();
do {
nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
- pages, vmas, locked,
+ pages, locked,
gup_flags);
if (nr_pinned_pages <= 0) {
rc = nr_pinned_pages;
@@ -2100,9 +2171,8 @@ static long __gup_longterm_locked(struct mm_struct *mm,
* Check that the given flags are valid for the exported gup/pup interface, and
* update them with the required flags that the caller must have set.
*/
-static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
- int *locked, unsigned int *gup_flags_p,
- unsigned int to_set)
+static bool is_valid_gup_args(struct page **pages, int *locked,
+ unsigned int *gup_flags_p, unsigned int to_set)
{
unsigned int gup_flags = *gup_flags_p;
@@ -2144,13 +2214,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
(gup_flags & FOLL_PCI_P2PDMA)))
return false;
- /*
- * Can't use VMAs with locked, as locked allows GUP to unlock
- * which invalidates the vmas array
- */
- if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE)))
- return false;
-
*gup_flags_p = gup_flags;
return true;
}
@@ -2165,8 +2228,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
@@ -2181,8 +2242,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_lock is held.
- *
* Must be called with mmap_lock held for read or write.
*
* get_user_pages_remote walks a process's page tables and takes a reference
@@ -2219,15 +2278,15 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas,
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
int local_locked = 1;
- if (!is_valid_gup_args(pages, vmas, locked, &gup_flags,
+ if (!is_valid_gup_args(pages, locked, &gup_flags,
FOLL_TOUCH | FOLL_REMOTE))
return -EINVAL;
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages,
locked ? locked : &local_locked,
gup_flags);
}
@@ -2237,7 +2296,7 @@ EXPORT_SYMBOL(get_user_pages_remote);
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
return 0;
}
@@ -2251,8 +2310,6 @@ long get_user_pages_remote(struct mm_struct *mm,
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
*
* This is the same as get_user_pages_remote(), just with a less-flexible
* calling convention where we assume that the mm being operated on belongs to
@@ -2260,16 +2317,15 @@ long get_user_pages_remote(struct mm_struct *mm,
* obviously don't pass FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ unsigned int gup_flags, struct page **pages)
{
int locked = 1;
- if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
return -EINVAL;
return __get_user_pages_locked(current->mm, start, nr_pages, pages,
- vmas, &locked, gup_flags);
+ &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages);
@@ -2293,12 +2349,12 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
{
int locked = 0;
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
FOLL_TOUCH | FOLL_UNLOCKABLE))
return -EINVAL;
return __get_user_pages_locked(current->mm, start, nr_pages, pages,
- NULL, &locked, gup_flags);
+ &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
@@ -2337,6 +2393,82 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*/
#ifdef CONFIG_HAVE_FAST_GUP
+/*
+ * Used in the GUP-fast path to determine whether a pin is permitted for a
+ * specific folio.
+ *
+ * This call assumes the caller has pinned the folio, that the lowest page table
+ * level still points to this folio, and that interrupts have been disabled.
+ *
+ * Writing to pinned file-backed dirty tracked folios is inherently problematic
+ * (see comment describing the writable_file_mapping_allowed() function). We
+ * therefore try to avoid the most egregious case of a long-term mapping doing
+ * so.
+ *
+ * This function cannot be as thorough as that one, as the VMA is not available
+ * in the fast path; instead we whitelist known good cases and, if in doubt,
+ * fall back to the slow path.
+ */
+static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
+{
+ struct address_space *mapping;
+ unsigned long mapping_flags;
+
+ /*
+ * If we aren't pinning then no problematic write can occur. A long term
+ * pin is the most egregious case so this is the one we disallow.
+ */
+ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) !=
+ (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
+ return true;
+
+ /* The folio is pinned, so we can safely access folio fields. */
+
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return false;
+
+ /* hugetlb mappings do not require dirty-tracking. */
+ if (folio_test_hugetlb(folio))
+ return true;
+
+ /*
+ * GUP-fast disables IRQs. When IRQs are disabled, RCU grace periods
+ * cannot proceed, which means no actions performed under RCU can
+ * proceed either.
+ *
+ * inodes and thus their mappings are freed under RCU, which means the
+ * mapping cannot be freed beneath us and thus we can safely dereference
+ * it.
+ */
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * However, there may be operations which _alter_ the mapping, so ensure
+ * we read it once and only once.
+ */
+ mapping = READ_ONCE(folio->mapping);
+
+ /*
+ * The mapping may have been truncated; in any case we cannot determine
+ * if this mapping is safe - fall back to the slow path to determine how to
+ * proceed.
+ */
+ if (!mapping)
+ return false;
+
+ /* Anonymous folios pose no problem. */
+ mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
+ if (mapping_flags)
+ return mapping_flags & PAGE_MAPPING_ANON;
+
+ /*
+ * At this point, we know the mapping is non-null and points to an
+ * address_space object. The only remaining whitelisted file system is
+ * shmem.
+ */
+ return shmem_mapping(mapping);
+}
+
static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
unsigned int flags,
struct page **pages)
@@ -2381,6 +2513,8 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
pte_t *ptep, *ptem;
ptem = ptep = pte_offset_map(&pmd, addr);
+ if (!ptep)
+ return 0;
do {
pte_t pte = ptep_get_lockless(ptep);
struct page *page;
@@ -2417,7 +2551,12 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
}
if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
- unlikely(pte_val(pte) != pte_val(*ptep))) {
+ unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+ gup_put_folio(folio, 1, flags);
+ goto pte_unmap;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2609,7 +2748,12 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
if (!folio)
return 0;
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
gup_put_folio(folio, refs, flags);
return 0;
}
@@ -2680,6 +2824,10 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
@@ -2720,6 +2868,11 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
gup_put_folio(folio, refs, flags);
return 0;
@@ -2755,6 +2908,16 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
+ if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
+ if (!folio_fast_pin_allowed(folio, flags)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
*nr += refs;
folio_set_referenced(folio);
return 1;
@@ -2969,7 +3132,7 @@ static int internal_get_user_pages_fast(unsigned long start,
start = untagged_addr(start) & PAGE_MASK;
len = nr_pages << PAGE_SHIFT;
if (check_add_overflow(start, len, &end))
- return 0;
+ return -EOVERFLOW;
if (end > TASK_SIZE_MAX)
return -EFAULT;
if (unlikely(!access_ok((void __user *)start, len)))
@@ -2983,7 +3146,7 @@ static int internal_get_user_pages_fast(unsigned long start,
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
- pages, NULL, &locked,
+ pages, &locked,
gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
if (ret < 0) {
/*
@@ -3025,7 +3188,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages,
* FOLL_FAST_ONLY is required in order to match the API description of
* this routine: no fall back to regular ("slow") GUP.
*/
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
FOLL_GET | FOLL_FAST_ONLY))
return -EINVAL;
@@ -3058,7 +3221,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
* FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
* request.
*/
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
return -EINVAL;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
@@ -3079,11 +3242,14 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for further details.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page() will not remove pins from it.
*/
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
return -EINVAL;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
@@ -3098,8 +3264,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast);
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
* subsequently whether VM_FAULT_RETRY functionality can be
* utilised. Lock must initially be held.
@@ -3110,18 +3274,21 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast);
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page*() will not remove pins from it.
*/
long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ int *locked)
{
int local_locked = 1;
- if (!is_valid_gup_args(pages, vmas, locked, &gup_flags,
+ if (!is_valid_gup_args(pages, locked, &gup_flags,
FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
return 0;
- return __gup_longterm_locked(mm, start, nr_pages, pages, vmas,
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
locked ? locked : &local_locked,
gup_flags);
}
@@ -3135,25 +3302,25 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
*
* Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
* FOLL_PIN is set.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page*() will not remove pins from it.
*/
long pin_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ unsigned int gup_flags, struct page **pages)
{
int locked = 1;
- if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN))
+ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
return 0;
return __gup_longterm_locked(current->mm, start, nr_pages,
- pages, vmas, &locked, gup_flags);
+ pages, &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
@@ -3161,17 +3328,20 @@ EXPORT_SYMBOL(pin_user_pages);
* pin_user_pages_unlocked() is the FOLL_PIN variant of
* get_user_pages_unlocked(). Behavior is the same, except that this one sets
* FOLL_PIN and rejects FOLL_GET.
+ *
+ * Note that if a zero_page is amongst the returned pages, it will not have
+ * pins in it and unpin_user_page*() will not remove pins from it.
*/
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
{
int locked = 0;
- if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags,
+ if (!is_valid_gup_args(pages, NULL, &gup_flags,
FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
return 0;
- return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL,
+ return __gup_longterm_locked(current->mm, start, nr_pages, pages,
&locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
diff --git a/mm/gup_test.c b/mm/gup_test.c
index 8ae7307a1bb6..eeb3f4d87c51 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -40,24 +40,25 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
unsigned long nr_pages)
{
unsigned long i;
- struct page *page;
+ struct folio *folio;
switch (cmd) {
case PIN_FAST_BENCHMARK:
case PIN_BASIC_TEST:
case PIN_LONGTERM_BENCHMARK:
for (i = 0; i < nr_pages; i++) {
- page = pages[i];
- if (WARN(!page_maybe_dma_pinned(page),
+ folio = page_folio(pages[i]);
+
+ if (WARN(!folio_maybe_dma_pinned(folio),
"pages[%lu] is NOT dma-pinned\n", i)) {
- dump_page(page, "gup_test failure");
+ dump_page(&folio->page, "gup_test failure");
break;
} else if (cmd == PIN_LONGTERM_BENCHMARK &&
- WARN(!is_longterm_pinnable_page(page),
+ WARN(!folio_is_longterm_pinnable(folio),
"pages[%lu] is NOT pinnable but pinned\n",
i)) {
- dump_page(page, "gup_test failure");
+ dump_page(&folio->page, "gup_test failure");
break;
}
}
@@ -139,29 +140,27 @@ static int __gup_test_ioctl(unsigned int cmd,
pages + i);
break;
case GUP_BASIC_TEST:
- nr = get_user_pages(addr, nr, gup->gup_flags, pages + i,
- NULL);
+ nr = get_user_pages(addr, nr, gup->gup_flags, pages + i);
break;
case PIN_FAST_BENCHMARK:
nr = pin_user_pages_fast(addr, nr, gup->gup_flags,
pages + i);
break;
case PIN_BASIC_TEST:
- nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i,
- NULL);
+ nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i);
break;
case PIN_LONGTERM_BENCHMARK:
nr = pin_user_pages(addr, nr,
gup->gup_flags | FOLL_LONGTERM,
- pages + i, NULL);
+ pages + i);
break;
case DUMP_USER_PAGES_TEST:
if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
nr = pin_user_pages(addr, nr, gup->gup_flags,
- pages + i, NULL);
+ pages + i);
else
nr = get_user_pages(addr, nr, gup->gup_flags,
- pages + i, NULL);
+ pages + i);
break;
default:
ret = -EINVAL;
@@ -271,7 +270,7 @@ static inline int pin_longterm_test_start(unsigned long arg)
gup_flags, pages);
else
cur_pages = pin_user_pages(addr, remaining_pages,
- gup_flags, pages, NULL);
+ gup_flags, pages);
if (cur_pages < 0) {
pin_longterm_test_stop();
ret = cur_pages;
@@ -381,6 +380,7 @@ static int gup_test_release(struct inode *inode, struct file *file)
static const struct file_operations gup_test_fops = {
.open = nonseekable_open,
.unlocked_ioctl = gup_test_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.release = gup_test_release,
};
diff --git a/mm/highmem.c b/mm/highmem.c
index db251e77f98f..e19269093a93 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -161,7 +161,7 @@ struct page *__kmap_to_page(void *vaddr)
/* kmap() mappings */
if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) &&
addr < PKMAP_ADDR(LAST_PKMAP)))
- return pte_page(pkmap_page_table[PKMAP_NR(addr)]);
+ return pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(addr)]));
/* kmap_local_page() mappings */
if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) &&
@@ -191,6 +191,7 @@ static void flush_all_zero_pkmaps(void)
for (i = 0; i < LAST_PKMAP; i++) {
struct page *page;
+ pte_t ptent;
/*
* zero means we don't have anything to do,
@@ -203,7 +204,8 @@ static void flush_all_zero_pkmaps(void)
pkmap_count[i] = 0;
/* sanity check */
- BUG_ON(pte_none(pkmap_page_table[i]));
+ ptent = ptep_get(&pkmap_page_table[i]);
+ BUG_ON(pte_none(ptent));
/*
* Don't need an atomic fetch-and-clear op here;
@@ -212,7 +214,7 @@ static void flush_all_zero_pkmaps(void)
* getting the kmap_lock (which is held here).
* So no dangers, even with speculative execution.
*/
- page = pte_page(pkmap_page_table[i]);
+ page = pte_page(ptent);
pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
set_page_address(page, NULL);
@@ -511,7 +513,7 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr)
{
#ifdef ARCH_NEEDS_KMAP_HIGH_GET
if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
- kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
+ kunmap_high(pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(vaddr)])));
return true;
}
#endif
@@ -548,7 +550,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn);
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
kmap_pte = kmap_get_pte(vaddr, idx);
- BUG_ON(!pte_none(*kmap_pte));
+ BUG_ON(!pte_none(ptep_get(kmap_pte)));
pteval = pfn_pte(pfn, prot);
arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval);
arch_kmap_local_post_map(vaddr, pteval);
diff --git a/mm/hmm.c b/mm/hmm.c
index 6a151c09de5e..855e25e59d8f 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -228,7 +228,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault;
unsigned long cpu_flags;
- pte_t pte = *ptep;
+ pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
if (pte_none_mostly(pte)) {
@@ -332,7 +332,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
pmd_t pmd;
again:
- pmd = READ_ONCE(*pmdp);
+ pmd = pmdp_get_lockless(pmdp);
if (pmd_none(pmd))
return hmm_vma_walk_hole(start, end, -1, walk);
@@ -381,6 +381,8 @@ again:
}
ptep = pte_offset_map(pmdp, addr);
+ if (!ptep)
+ goto again;
for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
int r;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 624671aaa60d..eb3678360b97 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -583,7 +583,7 @@ void prep_transhuge_page(struct page *page)
VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
INIT_LIST_HEAD(&folio->_deferred_list);
- set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+ folio_set_compound_dtor(folio, TRANSHUGE_PAGE_DTOR);
}
static inline bool is_transparent_hugepage(struct page *page)
@@ -1344,7 +1344,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
/*
* See do_wp_page(): we can only reuse the folio exclusively if
* there are no additional references. Note that we always drain
- * the LRU pagevecs immediately after adding a THP.
+ * the LRU cache immediately after adding a THP.
*/
if (folio_ref_count(folio) >
1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
@@ -1760,9 +1760,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
/*
* The destination pmd shouldn't be established, free_pgtables()
- * should have release it.
+ * should have released it; but move_page_tables() might have already
+ * inserted a page table, if racing against shmem/file collapse.
*/
- if (WARN_ON(!pmd_none(*new_pmd))) {
+ if (!pmd_none(*new_pmd)) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
return false;
}
@@ -2036,6 +2037,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
pgtable_t pgtable;
pmd_t _pmd, old_pmd;
+ unsigned long addr;
+ pte_t *pte;
int i;
/*
@@ -2051,17 +2054,20 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte);
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ pte_t entry;
+
+ entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
entry = pte_mkspecial(entry);
if (pmd_uffd_wp(old_pmd))
entry = pte_mkuffd_wp(entry);
- pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
+ VM_BUG_ON(!pte_none(ptep_get(pte)));
+ set_pte_at(mm, addr, pte, entry);
+ pte++;
}
+ pte_unmap(pte - 1);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
}
@@ -2076,6 +2082,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
bool anon_exclusive = false, dirty = false;
unsigned long addr;
+ pte_t *pte;
int i;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
@@ -2204,8 +2211,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte);
for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
- pte_t entry, *pte;
+ pte_t entry;
/*
* Note that NUMA hinting access restrictions are not
* transferred to avoid any possibility of altering
@@ -2248,11 +2257,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_mkuffd_wp(entry);
page_add_anon_rmap(page + i, vma, addr, false);
}
- pte = pte_offset_map(&_pmd, addr);
- BUG_ON(!pte_none(*pte));
+ VM_BUG_ON(!pte_none(ptep_get(pte)));
set_pte_at(mm, addr, pte, entry);
- pte_unmap(pte);
+ pte++;
}
+ pte_unmap(pte - 1);
if (!pmd_migration)
page_remove_rmap(page, vma, true);
@@ -2792,12 +2801,19 @@ void free_transhuge_page(struct page *page)
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
unsigned long flags;
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
- if (!list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
- list_del(&folio->_deferred_list);
+ /*
+ * At this point, there is no one trying to add the folio to
+ * deferred_list. If folio is not in deferred_list, it's safe
+ * to check without acquiring the split_queue_lock.
+ */
+ if (data_race(!list_empty(&folio->_deferred_list))) {
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ if (!list_empty(&folio->_deferred_list)) {
+ ds_queue->split_queue_len--;
+ list_del(&folio->_deferred_list);
+ }
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
free_compound_page(page);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f154019e6b84..bce28cca73a1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1489,7 +1489,6 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
set_page_refcounted(p);
}
- folio_set_order(folio, 0);
__folio_clear_head(folio);
}
@@ -1951,9 +1950,6 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
struct page *p;
__folio_clear_reserved(folio);
- __folio_set_head(folio);
- /* we rely on prep_new_hugetlb_folio to set the destructor */
- folio_set_order(folio, order);
for (i = 0; i < nr_pages; i++) {
p = folio_page(folio, i);
@@ -1999,6 +1995,9 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
if (i != 0)
set_compound_head(p, &folio->page);
}
+ __folio_set_head(folio);
+ /* we rely on prep_new_hugetlb_folio to set the destructor */
+ folio_set_order(folio, order);
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
atomic_set(&folio->_pincount, 0);
@@ -2017,8 +2016,6 @@ out_error:
p = folio_page(folio, j);
__ClearPageReserved(p);
}
- folio_set_order(folio, 0);
- __folio_clear_head(folio);
return false;
}
@@ -5016,7 +5013,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *src_vma)
{
pte_t *src_pte, *dst_pte, entry;
- struct page *ptepage;
+ struct folio *pte_folio;
unsigned long addr;
bool cow = is_cow_mapping(src_vma->vm_flags);
struct hstate *h = hstate_vma(src_vma);
@@ -5115,8 +5112,8 @@ again:
set_huge_pte_at(dst, addr, dst_pte, entry);
} else {
entry = huge_ptep_get(src_pte);
- ptepage = pte_page(entry);
- get_page(ptepage);
+ pte_folio = page_folio(pte_page(entry));
+ folio_get(pte_folio);
/*
* Failing to duplicate the anon rmap is a rare case
@@ -5128,10 +5125,10 @@ again:
* need to be without the pgtable locks since we could
* sleep during the process.
*/
- if (!PageAnon(ptepage)) {
- page_dup_file_rmap(ptepage, true);
- } else if (page_try_dup_anon_rmap(ptepage, true,
- src_vma)) {
+ if (!folio_test_anon(pte_folio)) {
+ page_dup_file_rmap(&pte_folio->page, true);
+ } else if (page_try_dup_anon_rmap(&pte_folio->page,
+ true, src_vma)) {
pte_t src_pte_old = entry;
struct folio *new_folio;
@@ -5140,14 +5137,14 @@ again:
/* Do not use reserve as it's private owned */
new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
if (IS_ERR(new_folio)) {
- put_page(ptepage);
+ folio_put(pte_folio);
ret = PTR_ERR(new_folio);
break;
}
ret = copy_user_large_folio(new_folio,
- page_folio(ptepage),
- addr, dst_vma);
- put_page(ptepage);
+ pte_folio,
+ addr, dst_vma);
+ folio_put(pte_folio);
if (ret) {
folio_put(new_folio);
break;
@@ -5540,7 +5537,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
const bool unshare = flags & FAULT_FLAG_UNSHARE;
pte_t pte = huge_ptep_get(ptep);
struct hstate *h = hstate_vma(vma);
- struct page *old_page;
+ struct folio *old_folio;
struct folio *new_folio;
int outside_reserve = 0;
vm_fault_t ret = 0;
@@ -5571,7 +5568,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
- old_page = pte_page(pte);
+ old_folio = page_folio(pte_page(pte));
delayacct_wpcopy_start();
@@ -5580,17 +5577,17 @@ retry_avoidcopy:
* If no-one else is actually using this page, we're the exclusive
* owner and can reuse this page.
*/
- if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
- if (!PageAnonExclusive(old_page))
- page_move_anon_rmap(old_page, vma);
+ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
+ if (!PageAnonExclusive(&old_folio->page))
+ page_move_anon_rmap(&old_folio->page, vma);
if (likely(!unshare))
set_huge_ptep_writable(vma, haddr, ptep);
delayacct_wpcopy_end();
return 0;
}
- VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
- old_page);
+ VM_BUG_ON_PAGE(folio_test_anon(old_folio) &&
+ PageAnonExclusive(&old_folio->page), &old_folio->page);
/*
* If the process that created a MAP_PRIVATE mapping is about to
@@ -5602,10 +5599,10 @@ retry_avoidcopy:
* of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
- page_folio(old_page) != pagecache_folio)
+ old_folio != pagecache_folio)
outside_reserve = 1;
- get_page(old_page);
+ folio_get(old_folio);
/*
* Drop page table lock as buddy allocator may be called. It will
@@ -5627,7 +5624,7 @@ retry_avoidcopy:
pgoff_t idx;
u32 hash;
- put_page(old_page);
+ folio_put(old_folio);
/*
* Drop hugetlb_fault_mutex and vma_lock before
* unmapping. unmapping needs to hold vma_lock
@@ -5642,7 +5639,7 @@ retry_avoidcopy:
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- unmap_ref_private(mm, vma, old_page, haddr);
+ unmap_ref_private(mm, vma, &old_folio->page, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
@@ -5672,7 +5669,7 @@ retry_avoidcopy:
goto out_release_all;
}
- if (copy_user_large_folio(new_folio, page_folio(old_page), address, vma)) {
+ if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
ret = VM_FAULT_HWPOISON_LARGE;
goto out_release_all;
}
@@ -5694,14 +5691,14 @@ retry_avoidcopy:
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
- page_remove_rmap(old_page, vma, true);
+ page_remove_rmap(&old_folio->page, vma, true);
hugepage_add_new_anon_rmap(new_folio, vma, haddr);
if (huge_pte_uffd_wp(pte))
newpte = huge_pte_mkuffd_wp(newpte);
set_huge_pte_at(mm, haddr, ptep, newpte);
folio_set_hugetlb_migratable(new_folio);
/* Make the old page be freed below */
- new_folio = page_folio(old_page);
+ new_folio = old_folio;
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
@@ -5710,11 +5707,11 @@ out_release_all:
* No restore in case of successful pagetable update (Break COW or
* unshare)
*/
- if (new_folio != page_folio(old_page))
+ if (new_folio != old_folio)
restore_reserve_on_error(h, vma, haddr, new_folio);
folio_put(new_folio);
out_release_old:
- put_page(old_page);
+ folio_put(old_folio);
spin_lock(ptl); /* Caller expects lock to be held */
@@ -5731,13 +5728,13 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
{
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = vma_hugecache_offset(h, vma, address);
- bool present;
-
- rcu_read_lock();
- present = page_cache_next_miss(mapping, idx, 1) != idx;
- rcu_read_unlock();
+ struct folio *folio;
- return present;
+ folio = filemap_get_folio(mapping, idx);
+ if (IS_ERR(folio))
+ return false;
+ folio_put(folio);
+ return true;
}
int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
@@ -6062,7 +6059,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vm_fault_t ret;
u32 hash;
pgoff_t idx;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct folio *pagecache_folio = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
@@ -6179,16 +6176,16 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* hugetlb_wp() requires page locks of pte_page(entry) and
* pagecache_folio, so here we need take the former one
- * when page != pagecache_folio or !pagecache_folio.
+ * when folio != pagecache_folio or !pagecache_folio.
*/
- page = pte_page(entry);
- if (page_folio(page) != pagecache_folio)
- if (!trylock_page(page)) {
+ folio = page_folio(pte_page(entry));
+ if (folio != pagecache_folio)
+ if (!folio_trylock(folio)) {
need_wait_lock = 1;
goto out_ptl;
}
- get_page(page);
+ folio_get(folio);
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(entry)) {
@@ -6204,9 +6201,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, haddr, ptep);
out_put_page:
- if (page_folio(page) != pagecache_folio)
- unlock_page(page);
- put_page(page);
+ if (folio != pagecache_folio)
+ folio_unlock(folio);
+ folio_put(folio);
out_ptl:
spin_unlock(ptl);
@@ -6225,7 +6222,7 @@ out_mutex:
* here without taking refcount.
*/
if (need_wait_lock)
- wait_on_page_locked(page);
+ folio_wait_locked(folio);
return ret;
}
@@ -6425,17 +6422,14 @@ out_release_nounlock:
}
#endif /* CONFIG_USERFAULTFD */
-static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
- int refs, struct page **pages,
- struct vm_area_struct **vmas)
+static void record_subpages(struct page *page, struct vm_area_struct *vma,
+ int refs, struct page **pages)
{
int nr;
for (nr = 0; nr < refs; nr++) {
if (likely(pages))
pages[nr] = nth_page(page, nr);
- if (vmas)
- vmas[nr] = vma;
}
}
@@ -6508,9 +6502,9 @@ out_unlock:
}
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *locked)
+ struct page **pages, unsigned long *position,
+ unsigned long *nr_pages, long i, unsigned int flags,
+ int *locked)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
@@ -6638,7 +6632,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* If subpage information not requested, update counters
* and skip the same_page loop below.
*/
- if (!pages && !vmas && !pfn_offset &&
+ if (!pages && !pfn_offset &&
(vaddr + huge_page_size(h) < vma->vm_end) &&
(remainder >= pages_per_huge_page(h))) {
vaddr += huge_page_size(h);
@@ -6653,11 +6647,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
(vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
- if (pages || vmas)
- record_subpages_vmas(nth_page(page, pfn_offset),
- vma, refs,
- likely(pages) ? pages + i : NULL,
- vmas ? vmas + i : NULL);
+ if (pages)
+ record_subpages(nth_page(page, pfn_offset),
+ vma, refs,
+ likely(pages) ? pages + i : NULL);
if (pages) {
/*
@@ -7137,7 +7130,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long saddr;
pte_t *spte = NULL;
pte_t *pte;
- spinlock_t *ptl;
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
@@ -7158,7 +7150,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
if (!spte)
goto out;
- ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
+ spin_lock(&mm->page_table_lock);
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
@@ -7166,7 +7158,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
put_page(virt_to_page(spte));
}
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
i_mmap_unlock_read(mapping);
@@ -7254,7 +7246,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
}
- BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
+ BUG_ON(pte && pte_present(ptep_get(pte)) && !pte_huge(ptep_get(pte)));
return pte;
}
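
Note: the hugetlb.c hunks above follow the usual page-to-folio conversion: once a folio is derived (e.g. via page_folio(pte_page(entry))), reference counting, locking and waiting go through the folio helpers instead of the per-page ones. A hedged sketch of that correspondence, assuming kernel-internal context; the function is illustrative only:

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Sketch: page-based calls and their folio-based replacements. */
static void folio_ref_lock_example(struct page *page)
{
	struct folio *folio = page_folio(page);

	folio_get(folio);			/* was: get_page(page) */
	if (folio_trylock(folio)) {		/* was: trylock_page(page) */
		/* ... operate on the locked folio ... */
		folio_unlock(folio);		/* was: unlock_page(page) */
	}
	folio_put(folio);			/* was: put_page(page) */
}
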
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 27f001e0f0a2..c2007ef5e9b0 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -105,7 +105,7 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
* remapping (which is calling @walk->remap_pte).
*/
if (!walk->reuse_page) {
- walk->reuse_page = pte_page(*pte);
+ walk->reuse_page = pte_page(ptep_get(pte));
/*
* Because the reuse address is part of the range that we are
* walking, skip the reuse address range.
@@ -239,7 +239,7 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* to the tail pages.
*/
pgprot_t pgprot = PAGE_KERNEL_RO;
- struct page *page = pte_page(*pte);
+ struct page *page = pte_page(ptep_get(pte));
pte_t entry;
/* Remapping the head page requires r/w */
@@ -286,7 +286,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
struct page *page;
void *to;
- BUG_ON(pte_page(*pte) != walk->reuse_page);
+ BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
page = list_first_entry(walk->vmemmap_pages, struct page, lru);
list_del(&page->lru);
@@ -384,8 +384,9 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
}
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
- gfp_t gfp_mask, struct list_head *list)
+ struct list_head *list)
{
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
int nid = page_to_nid((struct page *)start);
struct page *page, *next;
@@ -413,12 +414,11 @@ out:
* @end: end address of the vmemmap virtual address range that we want to
* remap.
* @reuse: reuse address.
- * @gfp_mask: GFP flag for allocating vmemmap pages.
*
* Return: %0 on success, negative error code otherwise.
*/
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
- unsigned long reuse, gfp_t gfp_mask)
+ unsigned long reuse)
{
LIST_HEAD(vmemmap_pages);
struct vmemmap_remap_walk walk = {
@@ -430,7 +430,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
/* See the comment in the vmemmap_remap_free(). */
BUG_ON(start - reuse != PAGE_SIZE);
- if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+ if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
return -ENOMEM;
mmap_read_lock(&init_mm);
@@ -476,8 +476,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
* When a HugeTLB page is freed to the buddy allocator, previously
* discarded vmemmap pages must be allocated and remapped.
*/
- ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
- GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
+ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse);
if (!ret) {
ClearHPageVmemmapOptimized(head);
static_branch_dec(&hugetlb_optimize_vmemmap_key);
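
Note: the hugetlb_vmemmap.c hunks replace direct dereferences of a pte_t pointer (*pte) with ptep_get(), which in the generic implementation reads the entry with READ_ONCE() so the walker sees a single, consistent snapshot. A minimal sketch of the idiom, assuming kernel-internal context; the helper name is illustrative:

#include <linux/pgtable.h>
#include <linux/mm.h>

/* Sketch: snapshot the PTE once, then test the snapshot. */
static bool pte_maps_page(pte_t *ptep, struct page *expected)
{
	pte_t pte = ptep_get(ptep);	/* was: pte_t pte = *ptep; */

	return pte_present(pte) && pte_page(pte) == expected;
}
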
diff --git a/mm/internal.h b/mm/internal.h
index 68410c6d97ac..a7d9e980429a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -133,8 +133,8 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
loff_t end);
long invalidate_inode_page(struct page *page);
-unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
- pgoff_t start, pgoff_t end, unsigned long *nr_pagevec);
+unsigned long mapping_try_invalidate(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_failed);
/**
* folio_evictable - Test whether a folio is evictable.
@@ -179,12 +179,6 @@ extern unsigned long highest_memmap_pfn;
#define MAX_RECLAIM_RETRIES 16
/*
- * in mm/early_ioremap.c
- */
-pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
- unsigned long size, pgprot_t prot);
-
-/*
* in mm/vmscan.c:
*/
bool isolate_lru_page(struct page *page);
@@ -208,10 +202,12 @@ extern char * const zone_names[MAX_NR_ZONES];
/* perform sanity checks on struct pages being allocated or freed */
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
-static inline bool is_check_pages_enabled(void)
-{
- return static_branch_unlikely(&check_pages_enabled);
-}
+extern int min_free_kbytes;
+
+void setup_per_zone_wmarks(void);
+void calculate_min_free_kbytes(void);
+int __meminit init_per_zone_wmark_min(void);
+void page_alloc_sysctl_init(void);
/*
* Structure for holding the mostly immutable allocation parameters passed
@@ -371,6 +367,13 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
}
+void set_zone_contiguous(struct zone *zone);
+
+static inline void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
int mt);
@@ -378,12 +381,27 @@ extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);
+/*
+ * This will have no effect, other than possibly generating a warning, if the
+ * caller passes in a non-large folio.
+ */
+static inline void folio_set_order(struct folio *folio, unsigned int order)
+{
+ if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
+ return;
+
+ folio->_folio_order = order;
+#ifdef CONFIG_64BIT
+ folio->_folio_nr_pages = 1U << order;
+#endif
+}
+
static inline void prep_compound_head(struct page *page, unsigned int order)
{
struct folio *folio = (struct folio *)page;
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
- set_compound_order(page, order);
+ folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+ folio_set_order(folio, order);
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
atomic_set(&folio->_pincount, 0);
@@ -416,27 +434,12 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr,
int nid, bool exact_nid);
-int split_free_page(struct page *free_page,
- unsigned int order, unsigned long split_pfn_offset);
+void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
+ unsigned long, enum meminit_context, struct vmem_altmap *, int);
-/*
- * This will have no effect, other than possibly generating a warning, if the
- * caller passes in a non-large folio.
- */
-static inline void folio_set_order(struct folio *folio, unsigned int order)
-{
- if (WARN_ON_ONCE(!folio_test_large(folio)))
- return;
- folio->_folio_order = order;
-#ifdef CONFIG_64BIT
- /*
- * When hugetlb dissolves a folio, we need to clear the tail
- * page, rather than setting nr_pages to 1.
- */
- folio->_folio_nr_pages = order ? 1U << order : 0;
-#endif
-}
+int split_free_page(struct page *free_page,
+ unsigned int order, unsigned long split_pfn_offset);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -563,8 +566,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
extern long faultin_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
bool write, int *locked);
-extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
- unsigned long len);
+extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+ unsigned long bytes);
/*
* mlock_vma_folio() and munlock_vma_folio():
* should be called with vma's mmap_lock held for read or write,
@@ -1047,17 +1050,17 @@ static inline void vma_iter_store(struct vma_iterator *vmi,
{
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
- if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) {
- printk("%lu > %lu\n", vmi->mas.index, vma->vm_start);
- printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end);
- printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last);
- mt_dump(vmi->mas.tree);
+ if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+ vmi->mas.index > vma->vm_start)) {
+ pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
+ vmi->mas.index, vma->vm_start, vma->vm_start,
+ vma->vm_end, vmi->mas.index, vmi->mas.last);
}
- if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) {
- printk("%lu < %lu\n", vmi->mas.last, vma->vm_start);
- printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end);
- printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last);
- mt_dump(vmi->mas.tree);
+ if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START &&
+ vmi->mas.last < vma->vm_start)) {
+ pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
+ vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
+ vmi->mas.index, vmi->mas.last);
}
#endif
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index b376a5d055e5..256930da578a 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -445,7 +445,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
bool __kasan_check_byte(const void *address, unsigned long ip)
{
if (!kasan_byte_accessible(address)) {
- kasan_report((unsigned long)address, 1, false, ip);
+ kasan_report(address, 1, false, ip);
return false;
}
return true;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index e5eef670735e..5b4c97baa656 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -40,39 +40,39 @@
* depending on memory access size X.
*/
-static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+static __always_inline bool memory_is_poisoned_1(const void *addr)
{
- s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+ s8 shadow_value = *(s8 *)kasan_mem_to_shadow(addr);
if (unlikely(shadow_value)) {
- s8 last_accessible_byte = addr & KASAN_GRANULE_MASK;
+ s8 last_accessible_byte = (unsigned long)addr & KASAN_GRANULE_MASK;
return unlikely(last_accessible_byte >= shadow_value);
}
return false;
}
-static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
+static __always_inline bool memory_is_poisoned_2_4_8(const void *addr,
unsigned long size)
{
- u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
+ u8 *shadow_addr = (u8 *)kasan_mem_to_shadow(addr);
/*
* Access crosses 8(shadow size)-byte boundary. Such access maps
* into 2 shadow bytes, so we need to check them both.
*/
- if (unlikely(((addr + size - 1) & KASAN_GRANULE_MASK) < size - 1))
+ if (unlikely((((unsigned long)addr + size - 1) & KASAN_GRANULE_MASK) < size - 1))
return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
return memory_is_poisoned_1(addr + size - 1);
}
-static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+static __always_inline bool memory_is_poisoned_16(const void *addr)
{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow(addr);
/* Unaligned 16-bytes access maps into 3 shadow bytes. */
- if (unlikely(!IS_ALIGNED(addr, KASAN_GRANULE_SIZE)))
+ if (unlikely(!IS_ALIGNED((unsigned long)addr, KASAN_GRANULE_SIZE)))
return *shadow_addr || memory_is_poisoned_1(addr + 15);
return *shadow_addr;
@@ -120,26 +120,25 @@ static __always_inline unsigned long memory_is_nonzero(const void *start,
return bytes_is_nonzero(start, (end - start) % 8);
}
-static __always_inline bool memory_is_poisoned_n(unsigned long addr,
- size_t size)
+static __always_inline bool memory_is_poisoned_n(const void *addr, size_t size)
{
unsigned long ret;
- ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
- kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+ ret = memory_is_nonzero(kasan_mem_to_shadow(addr),
+ kasan_mem_to_shadow(addr + size - 1) + 1);
if (unlikely(ret)) {
- unsigned long last_byte = addr + size - 1;
- s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+ const void *last_byte = addr + size - 1;
+ s8 *last_shadow = (s8 *)kasan_mem_to_shadow(last_byte);
if (unlikely(ret != (unsigned long)last_shadow ||
- ((long)(last_byte & KASAN_GRANULE_MASK) >= *last_shadow)))
+ (((long)last_byte & KASAN_GRANULE_MASK) >= *last_shadow)))
return true;
}
return false;
}
-static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+static __always_inline bool memory_is_poisoned(const void *addr, size_t size)
{
if (__builtin_constant_p(size)) {
switch (size) {
@@ -159,7 +158,7 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
return memory_is_poisoned_n(addr, size);
}
-static __always_inline bool check_region_inline(unsigned long addr,
+static __always_inline bool check_region_inline(const void *addr,
size_t size, bool write,
unsigned long ret_ip)
{
@@ -172,7 +171,7 @@ static __always_inline bool check_region_inline(unsigned long addr,
if (unlikely(addr + size < addr))
return !kasan_report(addr, size, write, ret_ip);
- if (unlikely(!addr_has_metadata((void *)addr)))
+ if (unlikely(!addr_has_metadata(addr)))
return !kasan_report(addr, size, write, ret_ip);
if (likely(!memory_is_poisoned(addr, size)))
@@ -181,7 +180,7 @@ static __always_inline bool check_region_inline(unsigned long addr,
return !kasan_report(addr, size, write, ret_ip);
}
-bool kasan_check_range(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(const void *addr, size_t size, bool write,
unsigned long ret_ip)
{
return check_region_inline(addr, size, write, ret_ip);
@@ -221,36 +220,37 @@ static void register_global(struct kasan_global *global)
KASAN_GLOBAL_REDZONE, false);
}
-void __asan_register_globals(struct kasan_global *globals, size_t size)
+void __asan_register_globals(void *ptr, ssize_t size)
{
int i;
+ struct kasan_global *globals = ptr;
for (i = 0; i < size; i++)
register_global(&globals[i]);
}
EXPORT_SYMBOL(__asan_register_globals);
-void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+void __asan_unregister_globals(void *ptr, ssize_t size)
{
}
EXPORT_SYMBOL(__asan_unregister_globals);
#define DEFINE_ASAN_LOAD_STORE(size) \
- void __asan_load##size(unsigned long addr) \
+ void __asan_load##size(void *addr) \
{ \
check_region_inline(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_load##size); \
__alias(__asan_load##size) \
- void __asan_load##size##_noabort(unsigned long); \
+ void __asan_load##size##_noabort(void *); \
EXPORT_SYMBOL(__asan_load##size##_noabort); \
- void __asan_store##size(unsigned long addr) \
+ void __asan_store##size(void *addr) \
{ \
check_region_inline(addr, size, true, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_store##size); \
__alias(__asan_store##size) \
- void __asan_store##size##_noabort(unsigned long); \
+ void __asan_store##size##_noabort(void *); \
EXPORT_SYMBOL(__asan_store##size##_noabort)
DEFINE_ASAN_LOAD_STORE(1);
@@ -259,24 +259,24 @@ DEFINE_ASAN_LOAD_STORE(4);
DEFINE_ASAN_LOAD_STORE(8);
DEFINE_ASAN_LOAD_STORE(16);
-void __asan_loadN(unsigned long addr, size_t size)
+void __asan_loadN(void *addr, ssize_t size)
{
kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_loadN);
__alias(__asan_loadN)
-void __asan_loadN_noabort(unsigned long, size_t);
+void __asan_loadN_noabort(void *, ssize_t);
EXPORT_SYMBOL(__asan_loadN_noabort);
-void __asan_storeN(unsigned long addr, size_t size)
+void __asan_storeN(void *addr, ssize_t size)
{
kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__asan_storeN);
__alias(__asan_storeN)
-void __asan_storeN_noabort(unsigned long, size_t);
+void __asan_storeN_noabort(void *, ssize_t);
EXPORT_SYMBOL(__asan_storeN_noabort);
/* to shut up compiler complaints */
@@ -284,7 +284,7 @@ void __asan_handle_no_return(void) {}
EXPORT_SYMBOL(__asan_handle_no_return);
/* Emitted by compiler to poison alloca()ed objects. */
-void __asan_alloca_poison(unsigned long addr, size_t size)
+void __asan_alloca_poison(void *addr, ssize_t size)
{
size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE);
size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
@@ -295,7 +295,7 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
KASAN_ALLOCA_REDZONE_SIZE);
const void *right_redzone = (const void *)(addr + rounded_up_size);
- WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
+ WARN_ON(!IS_ALIGNED((unsigned long)addr, KASAN_ALLOCA_REDZONE_SIZE));
kasan_unpoison((const void *)(addr + rounded_down_size),
size - rounded_down_size, false);
@@ -307,18 +307,18 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
EXPORT_SYMBOL(__asan_alloca_poison);
/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
-void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
+void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom)
{
- if (unlikely(!stack_top || stack_top > stack_bottom))
+ if (unlikely(!stack_top || stack_top > (void *)stack_bottom))
return;
- kasan_unpoison(stack_top, stack_bottom - stack_top, false);
+ kasan_unpoison(stack_top, (void *)stack_bottom - stack_top, false);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
/* Emitted by the compiler to [un]poison local variables. */
#define DEFINE_ASAN_SET_SHADOW(byte) \
- void __asan_set_shadow_##byte(const void *addr, size_t size) \
+ void __asan_set_shadow_##byte(const void *addr, ssize_t size) \
{ \
__memset((void *)addr, 0x##byte, size); \
} \
@@ -488,7 +488,7 @@ static void __kasan_record_aux_stack(void *addr, bool can_alloc)
return;
alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
- alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc);
+ alloc_meta->aux_stack[0] = kasan_save_stack(0, can_alloc);
}
void kasan_record_aux_stack(void *addr)
@@ -518,7 +518,7 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object)
if (!free_meta)
return;
- kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+ kasan_set_track(&free_meta->free_track, 0);
/* The object was freed and has free track set. */
*(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK;
}
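
Note: the generic.c conversion to const void * does not change the shadow arithmetic: one shadow byte covers an 8-byte granule, zero means fully accessible, and a positive value N means only the first N bytes of the granule are valid (negative values mark redzones and freed memory). A worked sketch of the single-byte check, assuming the usual generic-KASAN layout; this is a standalone illustration, not the kernel's code:

#include <stdbool.h>

/* Sketch of the generic KASAN one-byte poison test. */
#define GRANULE_SIZE	8UL
#define GRANULE_MASK	(GRANULE_SIZE - 1)

static inline bool byte_is_poisoned(const void *addr, signed char shadow)
{
	/* offset of the accessed byte inside its 8-byte granule: 0..7 */
	signed char offset = (unsigned long)addr & GRANULE_MASK;

	if (shadow == 0)
		return false;		/* whole granule accessible */
	/* positive shadow: first 'shadow' bytes valid; negative: redzone */
	return offset >= shadow;
}
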
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index cc64ed6858c6..dcfec277e839 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -286,7 +286,7 @@ static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd)
for (i = 0; i < PTRS_PER_PTE; i++) {
pte = pte_start + i;
- if (!pte_none(*pte))
+ if (!pte_none(ptep_get(pte)))
return;
}
@@ -343,16 +343,19 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
unsigned long end)
{
unsigned long next;
+ pte_t ptent;
for (; addr < end; addr = next, pte++) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
if (next > end)
next = end;
- if (!pte_present(*pte))
+ ptent = ptep_get(pte);
+
+ if (!pte_present(ptent))
continue;
- if (WARN_ON(!kasan_early_shadow_page_entry(*pte)))
+ if (WARN_ON(!kasan_early_shadow_page_entry(ptent)))
continue;
pte_clear(&init_mm, addr, pte);
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index f5e4f5f2ba20..b799f11e45dc 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -198,13 +198,13 @@ enum kasan_report_type {
struct kasan_report_info {
/* Filled in by kasan_report_*(). */
enum kasan_report_type type;
- void *access_addr;
+ const void *access_addr;
size_t access_size;
bool is_write;
unsigned long ip;
/* Filled in by the common reporting code. */
- void *first_bad_addr;
+ const void *first_bad_addr;
struct kmem_cache *cache;
void *object;
size_t alloc_size;
@@ -311,7 +311,7 @@ static __always_inline bool addr_has_metadata(const void *addr)
* @ret_ip: return address
* @return: true if access was valid, false if invalid
*/
-bool kasan_check_range(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(const void *addr, size_t size, bool write,
unsigned long ret_ip);
#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
@@ -323,7 +323,7 @@ static __always_inline bool addr_has_metadata(const void *addr)
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-void *kasan_find_first_bad_addr(void *addr, size_t size);
+const void *kasan_find_first_bad_addr(const void *addr, size_t size);
size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache);
void kasan_complete_mode_report_info(struct kasan_report_info *info);
void kasan_metadata_fetch_row(char *buffer, void *row);
@@ -346,7 +346,7 @@ void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object);
static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { }
#endif
-bool kasan_report(unsigned long addr, size_t size,
+bool kasan_report(const void *addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type);
@@ -571,79 +571,82 @@ void kasan_restore_multi_shot(bool enabled);
*/
asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
-void __asan_register_globals(struct kasan_global *globals, size_t size);
-void __asan_unregister_globals(struct kasan_global *globals, size_t size);
+void __asan_register_globals(void *globals, ssize_t size);
+void __asan_unregister_globals(void *globals, ssize_t size);
void __asan_handle_no_return(void);
-void __asan_alloca_poison(unsigned long addr, size_t size);
-void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
-
-void __asan_load1(unsigned long addr);
-void __asan_store1(unsigned long addr);
-void __asan_load2(unsigned long addr);
-void __asan_store2(unsigned long addr);
-void __asan_load4(unsigned long addr);
-void __asan_store4(unsigned long addr);
-void __asan_load8(unsigned long addr);
-void __asan_store8(unsigned long addr);
-void __asan_load16(unsigned long addr);
-void __asan_store16(unsigned long addr);
-void __asan_loadN(unsigned long addr, size_t size);
-void __asan_storeN(unsigned long addr, size_t size);
-
-void __asan_load1_noabort(unsigned long addr);
-void __asan_store1_noabort(unsigned long addr);
-void __asan_load2_noabort(unsigned long addr);
-void __asan_store2_noabort(unsigned long addr);
-void __asan_load4_noabort(unsigned long addr);
-void __asan_store4_noabort(unsigned long addr);
-void __asan_load8_noabort(unsigned long addr);
-void __asan_store8_noabort(unsigned long addr);
-void __asan_load16_noabort(unsigned long addr);
-void __asan_store16_noabort(unsigned long addr);
-void __asan_loadN_noabort(unsigned long addr, size_t size);
-void __asan_storeN_noabort(unsigned long addr, size_t size);
-
-void __asan_report_load1_noabort(unsigned long addr);
-void __asan_report_store1_noabort(unsigned long addr);
-void __asan_report_load2_noabort(unsigned long addr);
-void __asan_report_store2_noabort(unsigned long addr);
-void __asan_report_load4_noabort(unsigned long addr);
-void __asan_report_store4_noabort(unsigned long addr);
-void __asan_report_load8_noabort(unsigned long addr);
-void __asan_report_store8_noabort(unsigned long addr);
-void __asan_report_load16_noabort(unsigned long addr);
-void __asan_report_store16_noabort(unsigned long addr);
-void __asan_report_load_n_noabort(unsigned long addr, size_t size);
-void __asan_report_store_n_noabort(unsigned long addr, size_t size);
-
-void __asan_set_shadow_00(const void *addr, size_t size);
-void __asan_set_shadow_f1(const void *addr, size_t size);
-void __asan_set_shadow_f2(const void *addr, size_t size);
-void __asan_set_shadow_f3(const void *addr, size_t size);
-void __asan_set_shadow_f5(const void *addr, size_t size);
-void __asan_set_shadow_f8(const void *addr, size_t size);
-
-void *__asan_memset(void *addr, int c, size_t len);
-void *__asan_memmove(void *dest, const void *src, size_t len);
-void *__asan_memcpy(void *dest, const void *src, size_t len);
-
-void __hwasan_load1_noabort(unsigned long addr);
-void __hwasan_store1_noabort(unsigned long addr);
-void __hwasan_load2_noabort(unsigned long addr);
-void __hwasan_store2_noabort(unsigned long addr);
-void __hwasan_load4_noabort(unsigned long addr);
-void __hwasan_store4_noabort(unsigned long addr);
-void __hwasan_load8_noabort(unsigned long addr);
-void __hwasan_store8_noabort(unsigned long addr);
-void __hwasan_load16_noabort(unsigned long addr);
-void __hwasan_store16_noabort(unsigned long addr);
-void __hwasan_loadN_noabort(unsigned long addr, size_t size);
-void __hwasan_storeN_noabort(unsigned long addr, size_t size);
-
-void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size);
-
-void *__hwasan_memset(void *addr, int c, size_t len);
-void *__hwasan_memmove(void *dest, const void *src, size_t len);
-void *__hwasan_memcpy(void *dest, const void *src, size_t len);
+void __asan_alloca_poison(void *, ssize_t size);
+void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom);
+
+void __asan_load1(void *);
+void __asan_store1(void *);
+void __asan_load2(void *);
+void __asan_store2(void *);
+void __asan_load4(void *);
+void __asan_store4(void *);
+void __asan_load8(void *);
+void __asan_store8(void *);
+void __asan_load16(void *);
+void __asan_store16(void *);
+void __asan_loadN(void *, ssize_t size);
+void __asan_storeN(void *, ssize_t size);
+
+void __asan_load1_noabort(void *);
+void __asan_store1_noabort(void *);
+void __asan_load2_noabort(void *);
+void __asan_store2_noabort(void *);
+void __asan_load4_noabort(void *);
+void __asan_store4_noabort(void *);
+void __asan_load8_noabort(void *);
+void __asan_store8_noabort(void *);
+void __asan_load16_noabort(void *);
+void __asan_store16_noabort(void *);
+void __asan_loadN_noabort(void *, ssize_t size);
+void __asan_storeN_noabort(void *, ssize_t size);
+
+void __asan_report_load1_noabort(void *);
+void __asan_report_store1_noabort(void *);
+void __asan_report_load2_noabort(void *);
+void __asan_report_store2_noabort(void *);
+void __asan_report_load4_noabort(void *);
+void __asan_report_store4_noabort(void *);
+void __asan_report_load8_noabort(void *);
+void __asan_report_store8_noabort(void *);
+void __asan_report_load16_noabort(void *);
+void __asan_report_store16_noabort(void *);
+void __asan_report_load_n_noabort(void *, ssize_t size);
+void __asan_report_store_n_noabort(void *, ssize_t size);
+
+void __asan_set_shadow_00(const void *addr, ssize_t size);
+void __asan_set_shadow_f1(const void *addr, ssize_t size);
+void __asan_set_shadow_f2(const void *addr, ssize_t size);
+void __asan_set_shadow_f3(const void *addr, ssize_t size);
+void __asan_set_shadow_f5(const void *addr, ssize_t size);
+void __asan_set_shadow_f8(const void *addr, ssize_t size);
+
+void *__asan_memset(void *addr, int c, ssize_t len);
+void *__asan_memmove(void *dest, const void *src, ssize_t len);
+void *__asan_memcpy(void *dest, const void *src, ssize_t len);
+
+void __hwasan_load1_noabort(void *);
+void __hwasan_store1_noabort(void *);
+void __hwasan_load2_noabort(void *);
+void __hwasan_store2_noabort(void *);
+void __hwasan_load4_noabort(void *);
+void __hwasan_store4_noabort(void *);
+void __hwasan_load8_noabort(void *);
+void __hwasan_store8_noabort(void *);
+void __hwasan_load16_noabort(void *);
+void __hwasan_store16_noabort(void *);
+void __hwasan_loadN_noabort(void *, ssize_t size);
+void __hwasan_storeN_noabort(void *, ssize_t size);
+
+void __hwasan_tag_memory(void *, u8 tag, ssize_t size);
+
+void *__hwasan_memset(void *addr, int c, ssize_t len);
+void *__hwasan_memmove(void *dest, const void *src, ssize_t len);
+void *__hwasan_memcpy(void *dest, const void *src, ssize_t len);
+
+void kasan_tag_mismatch(void *addr, unsigned long access_info,
+ unsigned long ret_ip);
#endif /* __MM_KASAN_KASAN_H */
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 892a9dc9d4d3..ca4b6ff080a6 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -43,6 +43,7 @@ enum kasan_arg_fault {
KASAN_ARG_FAULT_DEFAULT,
KASAN_ARG_FAULT_REPORT,
KASAN_ARG_FAULT_PANIC,
+ KASAN_ARG_FAULT_PANIC_ON_WRITE,
};
static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT;
@@ -57,6 +58,8 @@ static int __init early_kasan_fault(char *arg)
kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
else if (!strcmp(arg, "panic"))
kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
+ else if (!strcmp(arg, "panic_on_write"))
+ kasan_arg_fault = KASAN_ARG_FAULT_PANIC_ON_WRITE;
else
return -EINVAL;
@@ -211,7 +214,7 @@ static void start_report(unsigned long *flags, bool sync)
pr_err("==================================================================\n");
}
-static void end_report(unsigned long *flags, void *addr)
+static void end_report(unsigned long *flags, const void *addr, bool is_write)
{
if (addr)
trace_error_report_end(ERROR_DETECTOR_KASAN,
@@ -220,8 +223,18 @@ static void end_report(unsigned long *flags, void *addr)
spin_unlock_irqrestore(&report_lock, *flags);
if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
check_panic_on_warn("KASAN");
- if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)
+ switch (kasan_arg_fault) {
+ case KASAN_ARG_FAULT_DEFAULT:
+ case KASAN_ARG_FAULT_REPORT:
+ break;
+ case KASAN_ARG_FAULT_PANIC:
panic("kasan.fault=panic set ...\n");
+ break;
+ case KASAN_ARG_FAULT_PANIC_ON_WRITE:
+ if (is_write)
+ panic("kasan.fault=panic_on_write set ...\n");
+ break;
+ }
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
lockdep_on();
report_suppress_stop();
@@ -450,8 +463,8 @@ static void print_memory_metadata(const void *addr)
static void print_report(struct kasan_report_info *info)
{
- void *addr = kasan_reset_tag(info->access_addr);
- u8 tag = get_tag(info->access_addr);
+ void *addr = kasan_reset_tag((void *)info->access_addr);
+ u8 tag = get_tag((void *)info->access_addr);
print_error_description(info);
if (addr_has_metadata(addr))
@@ -468,12 +481,12 @@ static void print_report(struct kasan_report_info *info)
static void complete_report_info(struct kasan_report_info *info)
{
- void *addr = kasan_reset_tag(info->access_addr);
+ void *addr = kasan_reset_tag((void *)info->access_addr);
struct slab *slab;
if (info->type == KASAN_REPORT_ACCESS)
info->first_bad_addr = kasan_find_first_bad_addr(
- info->access_addr, info->access_size);
+ (void *)info->access_addr, info->access_size);
else
info->first_bad_addr = addr;
@@ -536,7 +549,11 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty
print_report(&info);
- end_report(&flags, ptr);
+ /*
+ * Invalid free is considered a "write" since the allocator's metadata
+ * updates involve writes.
+ */
+ end_report(&flags, ptr, true);
}
/*
@@ -544,11 +561,10 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty
* user_access_save/restore(): kasan_report_invalid_free() cannot be called
* from a UACCESS region, and kasan_report_async() is not used on x86.
*/
-bool kasan_report(unsigned long addr, size_t size, bool is_write,
+bool kasan_report(const void *addr, size_t size, bool is_write,
unsigned long ip)
{
bool ret = true;
- void *ptr = (void *)addr;
unsigned long ua_flags = user_access_save();
unsigned long irq_flags;
struct kasan_report_info info;
@@ -562,7 +578,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write,
memset(&info, 0, sizeof(info));
info.type = KASAN_REPORT_ACCESS;
- info.access_addr = ptr;
+ info.access_addr = addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
@@ -571,7 +587,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write,
print_report(&info);
- end_report(&irq_flags, ptr);
+ end_report(&irq_flags, (void *)addr, is_write);
out:
user_access_restore(ua_flags);
@@ -597,7 +613,11 @@ void kasan_report_async(void)
pr_err("Asynchronous fault: no details available\n");
pr_err("\n");
dump_stack_lvl(KERN_ERR);
- end_report(&flags, NULL);
+ /*
+ * Conservatively set is_write=true, because no details are available.
+ * In this mode, kasan.fault=panic_on_write is like kasan.fault=panic.
+ */
+ end_report(&flags, NULL, true);
}
#endif /* CONFIG_KASAN_HW_TAGS */
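
Note: the new kasan.fault=panic_on_write mode only escalates a report into a panic when the bad access is a write; invalid frees and asynchronous faults are conservatively treated as writes, as the comments above spell out. A small sketch of how the mode and the access direction combine; the enum and helper are illustrative, not the kernel's names:

#include <stdbool.h>

enum fault_mode { FAULT_REPORT, FAULT_PANIC, FAULT_PANIC_ON_WRITE };

/* Sketch: decide whether a KASAN report should escalate to a panic. */
static bool should_panic(enum fault_mode mode, bool is_write)
{
	switch (mode) {
	case FAULT_PANIC:
		return true;		/* panic on any report */
	case FAULT_PANIC_ON_WRITE:
		return is_write;	/* reads are only reported */
	case FAULT_REPORT:
	default:
		return false;
	}
}
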
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 87d39bc0a673..51a1e8a8877f 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -30,9 +30,9 @@
#include "kasan.h"
#include "../slab.h"
-void *kasan_find_first_bad_addr(void *addr, size_t size)
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
{
- void *p = addr;
+ const void *p = addr;
if (!addr_has_metadata(p))
return p;
@@ -362,14 +362,14 @@ void kasan_print_address_stack_frame(const void *addr)
#endif /* CONFIG_KASAN_STACK */
#define DEFINE_ASAN_REPORT_LOAD(size) \
-void __asan_report_load##size##_noabort(unsigned long addr) \
+void __asan_report_load##size##_noabort(void *addr) \
{ \
kasan_report(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_report_load##size##_noabort)
#define DEFINE_ASAN_REPORT_STORE(size) \
-void __asan_report_store##size##_noabort(unsigned long addr) \
+void __asan_report_store##size##_noabort(void *addr) \
{ \
kasan_report(addr, size, true, _RET_IP_); \
} \
@@ -386,13 +386,13 @@ DEFINE_ASAN_REPORT_STORE(4);
DEFINE_ASAN_REPORT_STORE(8);
DEFINE_ASAN_REPORT_STORE(16);
-void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+void __asan_report_load_n_noabort(void *addr, ssize_t size)
{
kasan_report(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_report_load_n_noabort);
-void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+void __asan_report_store_n_noabort(void *addr, ssize_t size)
{
kasan_report(addr, size, true, _RET_IP_);
}
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
index 32e80f78de7d..065e1b2fc484 100644
--- a/mm/kasan/report_hw_tags.c
+++ b/mm/kasan/report_hw_tags.c
@@ -15,7 +15,7 @@
#include "kasan.h"
-void *kasan_find_first_bad_addr(void *addr, size_t size)
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
{
/*
* Hardware Tag-Based KASAN only calls this function for normal memory
diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c
index 8b1f5a73ee6d..689e94f9fe3c 100644
--- a/mm/kasan/report_sw_tags.c
+++ b/mm/kasan/report_sw_tags.c
@@ -30,7 +30,7 @@
#include "kasan.h"
#include "../slab.h"
-void *kasan_find_first_bad_addr(void *addr, size_t size)
+const void *kasan_find_first_bad_addr(const void *addr, size_t size)
{
u8 tag = get_tag(addr);
void *p = kasan_reset_tag(addr);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index c8b86f3273b5..dd772f9d0f08 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -28,13 +28,13 @@
bool __kasan_check_read(const volatile void *p, unsigned int size)
{
- return kasan_check_range((unsigned long)p, size, false, _RET_IP_);
+ return kasan_check_range((void *)p, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__kasan_check_read);
bool __kasan_check_write(const volatile void *p, unsigned int size)
{
- return kasan_check_range((unsigned long)p, size, true, _RET_IP_);
+ return kasan_check_range((void *)p, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__kasan_check_write);
@@ -50,7 +50,7 @@ EXPORT_SYMBOL(__kasan_check_write);
#undef memset
void *memset(void *addr, int c, size_t len)
{
- if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
+ if (!kasan_check_range(addr, len, true, _RET_IP_))
return NULL;
return __memset(addr, c, len);
@@ -60,8 +60,8 @@ void *memset(void *addr, int c, size_t len)
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memmove(dest, src, len);
@@ -71,17 +71,17 @@ void *memmove(void *dest, const void *src, size_t len)
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memcpy(dest, src, len);
}
#endif
-void *__asan_memset(void *addr, int c, size_t len)
+void *__asan_memset(void *addr, int c, ssize_t len)
{
- if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
+ if (!kasan_check_range(addr, len, true, _RET_IP_))
return NULL;
return __memset(addr, c, len);
@@ -89,10 +89,10 @@ void *__asan_memset(void *addr, int c, size_t len)
EXPORT_SYMBOL(__asan_memset);
#ifdef __HAVE_ARCH_MEMMOVE
-void *__asan_memmove(void *dest, const void *src, size_t len)
+void *__asan_memmove(void *dest, const void *src, ssize_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memmove(dest, src, len);
@@ -100,10 +100,10 @@ void *__asan_memmove(void *dest, const void *src, size_t len)
EXPORT_SYMBOL(__asan_memmove);
#endif
-void *__asan_memcpy(void *dest, const void *src, size_t len)
+void *__asan_memcpy(void *dest, const void *src, ssize_t len)
{
- if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
- !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range(src, len, false, _RET_IP_) ||
+ !kasan_check_range(dest, len, true, _RET_IP_))
return NULL;
return __memcpy(dest, src, len);
@@ -111,13 +111,13 @@ void *__asan_memcpy(void *dest, const void *src, size_t len)
EXPORT_SYMBOL(__asan_memcpy);
#ifdef CONFIG_KASAN_SW_TAGS
-void *__hwasan_memset(void *addr, int c, size_t len) __alias(__asan_memset);
+void *__hwasan_memset(void *addr, int c, ssize_t len) __alias(__asan_memset);
EXPORT_SYMBOL(__hwasan_memset);
#ifdef __HAVE_ARCH_MEMMOVE
-void *__hwasan_memmove(void *dest, const void *src, size_t len) __alias(__asan_memmove);
+void *__hwasan_memmove(void *dest, const void *src, ssize_t len) __alias(__asan_memmove);
EXPORT_SYMBOL(__hwasan_memmove);
#endif
-void *__hwasan_memcpy(void *dest, const void *src, size_t len) __alias(__asan_memcpy);
+void *__hwasan_memcpy(void *dest, const void *src, ssize_t len) __alias(__asan_memcpy);
EXPORT_SYMBOL(__hwasan_memcpy);
#endif
@@ -226,7 +226,7 @@ static bool shadow_mapped(unsigned long addr)
if (pmd_bad(*pmd))
return true;
pte = pte_offset_kernel(pmd, addr);
- return !pte_none(*pte);
+ return !pte_none(ptep_get(pte));
}
static int __meminit kasan_mem_notifier(struct notifier_block *nb,
@@ -317,7 +317,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
unsigned long page;
pte_t pte;
- if (likely(!pte_none(*ptep)))
+ if (likely(!pte_none(ptep_get(ptep))))
return 0;
page = __get_free_page(GFP_KERNEL);
@@ -328,7 +328,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
- if (likely(pte_none(*ptep))) {
+ if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
page = 0;
}
@@ -418,11 +418,11 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
{
unsigned long page;
- page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT);
+ page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT);
spin_lock(&init_mm.page_table_lock);
- if (likely(!pte_none(*ptep))) {
+ if (likely(!pte_none(ptep_get(ptep)))) {
pte_clear(&init_mm, addr, ptep);
free_page(page);
}
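
Note: after the pointer-type change the shadow.c interceptors keep the same shape: validate the source range as a read and the destination range as a write, and only call the underlying __mem*() routine if both checks pass, returning NULL once a report has been issued. A minimal sketch of that wrapper pattern; check_range() here is a stand-in for kasan_check_range(), assumed purely for illustration:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

extern bool check_range(const void *addr, size_t len, bool write);

/* Sketch: an instrumented memcpy that refuses to touch bad ranges. */
static void *checked_memcpy(void *dest, const void *src, size_t len)
{
	if (!check_range(src, len, false) ||	/* source is read */
	    !check_range(dest, len, true))	/* destination is written */
		return NULL;			/* a report was issued */
	return memcpy(dest, src, len);
}
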
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 30da65fa02a1..220b5d4c6876 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -70,8 +70,8 @@ u8 kasan_random_tag(void)
return (u8)(state % (KASAN_TAG_MAX + 1));
}
-bool kasan_check_range(unsigned long addr, size_t size, bool write,
- unsigned long ret_ip)
+bool kasan_check_range(const void *addr, size_t size, bool write,
+ unsigned long ret_ip)
{
u8 tag;
u8 *shadow_first, *shadow_last, *shadow;
@@ -133,12 +133,12 @@ bool kasan_byte_accessible(const void *addr)
}
#define DEFINE_HWASAN_LOAD_STORE(size) \
- void __hwasan_load##size##_noabort(unsigned long addr) \
+ void __hwasan_load##size##_noabort(void *addr) \
{ \
- kasan_check_range(addr, size, false, _RET_IP_); \
+ kasan_check_range(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
- void __hwasan_store##size##_noabort(unsigned long addr) \
+ void __hwasan_store##size##_noabort(void *addr) \
{ \
kasan_check_range(addr, size, true, _RET_IP_); \
} \
@@ -150,25 +150,25 @@ DEFINE_HWASAN_LOAD_STORE(4);
DEFINE_HWASAN_LOAD_STORE(8);
DEFINE_HWASAN_LOAD_STORE(16);
-void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
+void __hwasan_loadN_noabort(void *addr, ssize_t size)
{
kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__hwasan_loadN_noabort);
-void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
+void __hwasan_storeN_noabort(void *addr, ssize_t size)
{
kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__hwasan_storeN_noabort);
-void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
+void __hwasan_tag_memory(void *addr, u8 tag, ssize_t size)
{
- kasan_poison((void *)addr, size, tag, false);
+ kasan_poison(addr, size, tag, false);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
-void kasan_tag_mismatch(unsigned long addr, unsigned long access_info,
+void kasan_tag_mismatch(void *addr, unsigned long access_info,
unsigned long ret_ip)
{
kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10,
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 67a222586846..7dcfe341d48e 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -140,5 +140,5 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
void kasan_save_free_info(struct kmem_cache *cache, void *object)
{
- save_stack_info(cache, object, GFP_NOWAIT, true);
+ save_stack_info(cache, object, 0, true);
}
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 6aee19a79236..9e008a336d9f 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -191,11 +191,10 @@ static size_t setup_test_cache(struct kunit *test, size_t size, slab_flags_t fla
kunit_info(test, "%s: size=%zu, ctor=%ps\n", __func__, size, ctor);
/*
- * Use SLAB_NOLEAKTRACE to prevent merging with existing caches. Any
- * other flag in SLAB_NEVER_MERGE also works. Use SLAB_ACCOUNT to
- * allocate via memcg, if enabled.
+ * Use SLAB_NO_MERGE to prevent merging with existing caches.
+ * Use SLAB_ACCOUNT to allocate via memcg, if enabled.
*/
- flags |= SLAB_NOLEAKTRACE | SLAB_ACCOUNT;
+ flags |= SLAB_NO_MERGE | SLAB_ACCOUNT;
test_cache = kmem_cache_create("test", size, 1, flags, ctor);
KUNIT_ASSERT_TRUE_MSG(test, test_cache, "could not create cache");
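
Note: the kfence test switches from piggybacking on SLAB_NOLEAKTRACE to the dedicated SLAB_NO_MERGE flag to keep its cache from being merged with a compatible existing cache. A minimal sketch of creating such an unmergeable, memcg-accounted cache, assuming kernel-internal context; the cache name and object size are illustrative:

#include <linux/slab.h>

/* Sketch: a cache that slab merging will always leave alone. */
static struct kmem_cache *make_private_cache(void)
{
	return kmem_cache_create("example_cache", 128, 1,
				 SLAB_NO_MERGE | SLAB_ACCOUNT, NULL);
}
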
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6b9d39d65b73..78c8d5d8b628 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -88,7 +88,7 @@ static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
#define MM_SLOTS_HASH_BITS 10
-static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __read_mostly;
@@ -422,19 +422,17 @@ void __khugepaged_enter(struct mm_struct *mm)
struct mm_slot *slot;
int wakeup;
+ /* __khugepaged_exit() must not run from under us */
+ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
+ return;
+
mm_slot = mm_slot_alloc(mm_slot_cache);
if (!mm_slot)
return;
slot = &mm_slot->slot;
- /* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
- if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
- mm_slot_free(mm_slot_cache, mm_slot);
- return;
- }
-
spin_lock(&khugepaged_mm_lock);
mm_slot_insert(mm_slots_hash, mm, slot);
/*
@@ -513,7 +511,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
struct folio *folio, *tmp;
while (--_pte >= pte) {
- pte_t pteval = *_pte;
+ pte_t pteval = ptep_get(_pte);
unsigned long pfn;
if (pte_none(pteval))
@@ -557,7 +555,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
- pte_t pteval = *_pte;
+ pte_t pteval = ptep_get(_pte);
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
++none_or_zero;
@@ -701,7 +699,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
- pteval = *_pte;
+ pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
if (is_zero_pfn(pte_pfn(pteval))) {
@@ -799,7 +797,7 @@ static int __collapse_huge_page_copy(pte_t *pte,
*/
for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
_pte++, page++, _address += PAGE_SIZE) {
- pteval = *_pte;
+ pteval = ptep_get(_pte);
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, _address);
continue;
@@ -946,10 +944,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
-/*
- * See pmd_trans_unstable() for how the result may change out from
- * underneath us, even if we hold mmap_lock in read.
- */
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
unsigned long address,
pmd_t **pmd)
@@ -961,11 +955,6 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
return SCAN_PMD_NULL;
pmde = pmdp_get_lockless(*pmd);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- /* See comments in pmd_none_or_trans_huge_or_clear_bad() */
- barrier();
-#endif
if (pmd_none(pmde))
return SCAN_PMD_NONE;
if (!pmd_present(pmde))
@@ -998,9 +987,8 @@ static int check_pmd_still_valid(struct mm_struct *mm,
* Only done if hpage_collapse_scan_pmd believes it is worthwhile.
*
* Called and returns without pte mapped or spinlocks held.
- * Note that if false is returned, mmap_lock will be released.
+ * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
*/
-
static int __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
@@ -1009,23 +997,37 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
int swapped_in = 0;
vm_fault_t ret = 0;
unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
+ int result;
+ pte_t *pte = NULL;
+ spinlock_t *ptl;
for (address = haddr; address < end; address += PAGE_SIZE) {
struct vm_fault vmf = {
.vma = vma,
.address = address,
- .pgoff = linear_page_index(vma, haddr),
+ .pgoff = linear_page_index(vma, address),
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
};
- vmf.pte = pte_offset_map(pmd, address);
- vmf.orig_pte = *vmf.pte;
- if (!is_swap_pte(vmf.orig_pte)) {
- pte_unmap(vmf.pte);
- continue;
+ if (!pte++) {
+ pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
+ if (!pte) {
+ mmap_read_unlock(mm);
+ result = SCAN_PMD_NULL;
+ goto out;
+ }
}
+
+ vmf.orig_pte = ptep_get_lockless(pte);
+ if (!is_swap_pte(vmf.orig_pte))
+ continue;
+
+ vmf.pte = pte;
+ vmf.ptl = ptl;
ret = do_swap_page(&vmf);
+ /* Which unmaps pte (after perhaps re-checking the entry) */
+ pte = NULL;
/*
* do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
@@ -1034,24 +1036,29 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
* resulting in later failure.
*/
if (ret & VM_FAULT_RETRY) {
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
/* Likely, but not guaranteed, that page lock failed */
- return SCAN_PAGE_LOCK;
+ result = SCAN_PAGE_LOCK;
+ goto out;
}
if (ret & VM_FAULT_ERROR) {
mmap_read_unlock(mm);
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return SCAN_FAIL;
+ result = SCAN_FAIL;
+ goto out;
}
swapped_in++;
}
- /* Drain LRU add pagevec to remove extra pin on the swapped in pages */
+ if (pte)
+ pte_unmap(pte);
+
+ /* Drain LRU cache to remove extra pin on the swapped in pages */
if (swapped_in)
lru_add_drain();
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
- return SCAN_SUCCEED;
+ result = SCAN_SUCCEED;
+out:
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
+ return result;
}
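
Note: the reworked __collapse_huge_page_swapin() above maps the page table once with pte_offset_map_nolock() (which can now fail and return NULL), samples each entry with ptep_get_lockless(), and lets do_swap_page() unmap the PTE it is handed. A minimal sketch of that walk pattern, assuming kernel-internal context; the function and its return convention are illustrative only:

#include <linux/mm.h>
#include <linux/pgtable.h>
#include <linux/swapops.h>

/* Sketch: count swap entries in one PMD's range without taking the PTL. */
static int count_swap_ptes(struct mm_struct *mm, pmd_t *pmd,
			   unsigned long start, unsigned long end)
{
	spinlock_t *ptl;
	pte_t *start_pte, *pte;
	unsigned long addr;
	int nr = 0;

	start_pte = pte_offset_map_nolock(mm, pmd, start, &ptl);
	if (!start_pte)			/* the page table vanished under us */
		return -EAGAIN;
	for (addr = start, pte = start_pte; addr < end; addr += PAGE_SIZE, pte++)
		if (is_swap_pte(ptep_get_lockless(pte)))
			nr++;
	pte_unmap(start_pte);
	return nr;
}
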
static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
@@ -1151,9 +1158,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
- pte = pte_offset_map(pmd, address);
- pte_ptl = pte_lockptr(mm, pmd);
-
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* This removes any huge TLB entry from the CPU so we won't allow
@@ -1168,13 +1172,18 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
mmu_notifier_invalidate_range_end(&range);
tlb_remove_table_sync_one();
- spin_lock(pte_ptl);
- result = __collapse_huge_page_isolate(vma, address, pte, cc,
- &compound_pagelist);
- spin_unlock(pte_ptl);
+ pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+ if (pte) {
+ result = __collapse_huge_page_isolate(vma, address, pte, cc,
+ &compound_pagelist);
+ spin_unlock(pte_ptl);
+ } else {
+ result = SCAN_PMD_NULL;
+ }
if (unlikely(result != SCAN_SUCCEED)) {
- pte_unmap(pte);
+ if (pte)
+ pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
/*
@@ -1258,9 +1267,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte) {
+ result = SCAN_PMD_NULL;
+ goto out;
+ }
+
for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
- pte_t pteval = *_pte;
+ pte_t pteval = ptep_get(_pte);
if (is_swap_pte(pteval)) {
++unmapped;
if (!cc->is_khugepaged ||
@@ -1627,25 +1641,28 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* lockless_pages_from_mm() and the hardware page walker can access page
* tables while all the high-level locks are held in write mode.
*/
- start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
result = SCAN_FAIL;
+ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+ if (!start_pte)
+ goto drop_immap;
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
struct page *page;
+ pte_t ptent = ptep_get(pte);
/* empty pte, skip */
- if (pte_none(*pte))
+ if (pte_none(ptent))
continue;
/* page swapped out, abort */
- if (!pte_present(*pte)) {
+ if (!pte_present(ptent)) {
result = SCAN_PTE_NON_PRESENT;
goto abort;
}
- page = vm_normal_page(vma, addr, *pte);
+ page = vm_normal_page(vma, addr, ptent);
if (WARN_ON_ONCE(page && is_zone_device_page(page)))
page = NULL;
/*
@@ -1661,10 +1678,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
struct page *page;
+ pte_t ptent = ptep_get(pte);
- if (pte_none(*pte))
+ if (pte_none(ptent))
continue;
- page = vm_normal_page(vma, addr, *pte);
+ page = vm_normal_page(vma, addr, ptent);
if (WARN_ON_ONCE(page && is_zone_device_page(page)))
goto abort;
page_remove_rmap(page, vma, false);
@@ -1702,6 +1720,7 @@ drop_hpage:
abort:
pte_unmap_unlock(start_pte, ptl);
+drop_immap:
i_mmap_unlock_write(vma->vm_file->f_mapping);
goto drop_hpage;
}
@@ -1918,9 +1937,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
} while (1);
- xas_set(&xas, start);
for (index = start; index < end; index++) {
- page = xas_next(&xas);
+ xas_set(&xas, index);
+ page = xas_load(&xas);
VM_BUG_ON(index != xas.xa_index);
if (is_shmem) {
@@ -1935,7 +1954,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
result = SCAN_TRUNCATED;
goto xa_locked;
}
- xas_set(&xas, index + 1);
}
if (!shmem_charge(mapping->host, 1)) {
result = SCAN_FAIL;
@@ -1953,7 +1971,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
result = SCAN_FAIL;
goto xa_unlocked;
}
- /* drain pagevecs to help isolate_lru_page() */
+ /* drain lru cache to help isolate_lru_page() */
lru_add_drain();
page = folio_file_page(folio, index);
} else if (trylock_page(page)) {
@@ -1969,7 +1987,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
end - index);
- /* drain pagevecs to help isolate_lru_page() */
+ /* drain lru cache to help isolate_lru_page() */
lru_add_drain();
page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
@@ -2070,9 +2088,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
xas_lock_irq(&xas);
- xas_set(&xas, index);
- VM_BUG_ON_PAGE(page != xas_load(&xas), page);
+ VM_BUG_ON_PAGE(page != xa_load(xas.xa, index), page);
/*
* We control three references to the page:
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index 7d1e4aa30bae..3adb4c1d3b19 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -74,7 +74,7 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);
/* Don't sleep. */
- flags &= ~__GFP_DIRECT_RECLAIM;
+ flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM);
handle = __stack_depot_save(entries, nr_entries, flags, true);
return stack_depot_set_extra_bits(handle, extra);
@@ -245,7 +245,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
extra_bits = kmsan_extra_bits(depth, uaf);
entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN;
- entries[1] = kmsan_save_stack_with_flags(GFP_ATOMIC, 0);
+ entries[1] = kmsan_save_stack_with_flags(__GFP_HIGH, 0);
entries[2] = id;
/*
* @entries is a local var in non-instrumented code, so KMSAN does not
@@ -253,7 +253,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
* positives when __stack_depot_save() passes it to instrumented code.
*/
kmsan_internal_unpoison_memory(entries, sizeof(entries), false);
- handle = __stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC,
+ handle = __stack_depot_save(entries, ARRAY_SIZE(entries), __GFP_HIGH,
true);
return stack_depot_set_extra_bits(handle, extra_bits);
}
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
index cf12e9616b24..cc3907a9c33a 100644
--- a/mm/kmsan/instrumentation.c
+++ b/mm/kmsan/instrumentation.c
@@ -282,7 +282,7 @@ void __msan_poison_alloca(void *address, uintptr_t size, char *descr)
/* stack_depot_save() may allocate memory. */
kmsan_enter_runtime();
- handle = stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC);
+ handle = stack_depot_save(entries, ARRAY_SIZE(entries), __GFP_HIGH);
kmsan_leave_runtime();
kmsan_internal_set_shadow_origin(address, size, -1, handle,
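The KMSAN hunks follow one rule: metadata allocations made while handling an event must never recurse into reclaim, so both reclaim bits are masked off and plain __GFP_HIGH replaces GFP_ATOMIC (which additionally wakes kswapd) for stack depot saves. A small sketch of that flag handling, condensed into a hypothetically named save_stack_noreclaim():

static depot_stack_handle_t save_stack_noreclaim(gfp_t flags)
{
	unsigned long entries[KMSAN_STACK_DEPTH];
	unsigned int nr_entries;

	nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);

	/* Neither direct reclaim nor a kswapd wakeup is allowed here. */
	flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM);

	return __stack_depot_save(entries, nr_entries, flags, true);
}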
diff --git a/mm/ksm.c b/mm/ksm.c
index 0156bded3a66..ba266359da55 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -429,16 +429,17 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
struct page *page = NULL;
spinlock_t *ptl;
pte_t *pte;
+ pte_t ptent;
int ret;
- if (pmd_leaf(*pmd) || !pmd_present(*pmd))
- return 0;
-
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (pte_present(*pte)) {
- page = vm_normal_page(walk->vma, addr, *pte);
- } else if (!pte_none(*pte)) {
- swp_entry_t entry = pte_to_swp_entry(*pte);
+ if (!pte)
+ return 0;
+ ptent = ptep_get(pte);
+ if (pte_present(ptent)) {
+ page = vm_normal_page(walk->vma, addr, ptent);
+ } else if (!pte_none(ptent)) {
+ swp_entry_t entry = pte_to_swp_entry(ptent);
/*
* As KSM pages remain KSM pages until freed, no need to wait
@@ -931,7 +932,7 @@ static int remove_stable_node(struct ksm_stable_node *stable_node)
* The stable node did not yet appear stale to get_ksm_page(),
* since that allows for an unmapped ksm page to be recognized
* right up until it is freed; but the node is safe to remove.
- * This page might be in a pagevec waiting to be freed,
+ * This page might be in an LRU cache waiting to be freed,
* or it might be PageSwapCache (perhaps under writeback),
* or it might have been removed from swapcache a moment ago.
*/
@@ -1086,6 +1087,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
int err = -EFAULT;
struct mmu_notifier_range range;
bool anon_exclusive;
+ pte_t entry;
pvmw.address = page_address_in_vma(page, vma);
if (pvmw.address == -EFAULT)
@@ -1103,10 +1105,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
goto out_unlock;
anon_exclusive = PageAnonExclusive(page);
- if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
+ entry = ptep_get(pvmw.pte);
+ if (pte_write(entry) || pte_dirty(entry) ||
anon_exclusive || mm_tlb_flush_pending(mm)) {
- pte_t entry;
-
swapped = PageSwapCache(page);
flush_cache_page(vma, pvmw.address, page_to_pfn(page));
/*
@@ -1148,7 +1149,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
}
- *orig_pte = *pvmw.pte;
+ *orig_pte = entry;
err = 0;
out_unlock:
@@ -1194,8 +1195,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
* without holding anon_vma lock for write. So when looking for a
* genuine pmde (in which to find pte), test present and !THP together.
*/
- pmde = *pmd;
- barrier();
+ pmde = pmdp_get_lockless(pmd);
if (!pmd_present(pmde) || pmd_trans_huge(pmde))
goto out;
@@ -1204,7 +1204,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
mmu_notifier_invalidate_range_start(&range);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!pte_same(*ptep, orig_pte)) {
+ if (!ptep)
+ goto out_mn;
+ if (!pte_same(ptep_get(ptep), orig_pte)) {
pte_unmap_unlock(ptep, ptl);
goto out_mn;
}
@@ -1231,7 +1233,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
dec_mm_counter(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, addr, pte_pfn(*ptep));
+ flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
/*
* No need to notify as we are replacing a read only page with another
* read only page with the same content.
@@ -2301,8 +2303,8 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
/*
- * A number of pages can hang around indefinitely on per-cpu
- * pagevecs, raised page count preventing write_protect_page
+ * A number of pages can hang around indefinitely in per-cpu
+ * LRU cache, raised page count preventing write_protect_page
* from merging them. Though it doesn't really matter much,
* it is puzzling to see some stuck in pages_volatile until
* other activity jostles them out, and they also prevented
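The ksm.c changes replace the open-coded "pmde = *pmd; barrier()" read with pmdp_get_lockless(), and every PTE comparison goes through ptep_get() after the map has been retaken (and may have failed). A sketch of that revalidate-under-lock pattern, with a hypothetical helper name and the actual PTE update elided:

static int with_revalidated_pte(struct mm_struct *mm, pmd_t *pmd,
				unsigned long addr, pte_t orig_pte)
{
	spinlock_t *ptl;
	pte_t *ptep;
	pmd_t pmde = pmdp_get_lockless(pmd);	/* single tear-free read, no lock */

	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		return -EAGAIN;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!ptep)
		return -EAGAIN;
	if (!pte_same(ptep_get(ptep), orig_pte)) {	/* lost a race since orig_pte was sampled */
		pte_unmap_unlock(ptep, ptl);
		return -EAGAIN;
	}

	/* ... PTE is stable here; safe to modify it under ptl ... */

	pte_unmap_unlock(ptep, ptl);
	return 0;
}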
diff --git a/mm/madvise.c b/mm/madvise.c
index b5ffbaf616f5..886f06066622 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -188,37 +188,43 @@ success:
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
- unsigned long end, struct mm_walk *walk)
+ unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->private;
- unsigned long index;
struct swap_iocb *splug = NULL;
+ pte_t *ptep = NULL;
+ spinlock_t *ptl;
+ unsigned long addr;
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- return 0;
-
- for (index = start; index != end; index += PAGE_SIZE) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
pte_t pte;
swp_entry_t entry;
struct page *page;
- spinlock_t *ptl;
- pte_t *ptep;
- ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
- pte = *ptep;
- pte_unmap_unlock(ptep, ptl);
+ if (!ptep++) {
+ ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!ptep)
+ break;
+ }
+ pte = ptep_get(ptep);
if (!is_swap_pte(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
continue;
+ pte_unmap_unlock(ptep, ptl);
+ ptep = NULL;
+
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, index, false, &splug);
+ vma, addr, false, &splug);
if (page)
put_page(page);
}
+
+ if (ptep)
+ pte_unmap_unlock(ptep, ptl);
swap_read_unplug(splug);
cond_resched();
@@ -229,30 +235,34 @@ static const struct mm_walk_ops swapin_walk_ops = {
.pmd_entry = swapin_walk_pmd_entry,
};
-static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+static void shmem_swapin_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
- pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
+ pgoff_t end_index = linear_page_index(vma, end) - 1;
struct page *page;
struct swap_iocb *splug = NULL;
rcu_read_lock();
xas_for_each(&xas, page, end_index) {
- swp_entry_t swap;
+ unsigned long addr;
+ swp_entry_t entry;
if (!xa_is_value(page))
continue;
- swap = radix_to_swp_entry(page);
+ entry = radix_to_swp_entry(page);
/* There might be swapin error entries in shmem mapping. */
- if (non_swap_entry(swap))
+ if (non_swap_entry(entry))
continue;
+
+ addr = vma->vm_start +
+ ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
xas_pause(&xas);
rcu_read_unlock();
- page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
- NULL, 0, false, &splug);
+ page = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
+ vma, addr, false, &splug);
if (page)
put_page(page);
@@ -260,8 +270,6 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
}
rcu_read_unlock();
swap_read_unplug(splug);
-
- lru_add_drain(); /* Push any new pages onto the LRU now */
}
#endif /* CONFIG_SWAP */
@@ -285,8 +293,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
}
if (shmem_mapping(file->f_mapping)) {
- force_shm_swapin_readahead(vma, start, end,
- file->f_mapping);
+ shmem_swapin_range(vma, start, end, file->f_mapping);
+ lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
#else
@@ -340,7 +348,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
bool pageout = private->pageout;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
- pte_t *orig_pte, *pte, ptent;
+ pte_t *start_pte, *pte, ptent;
spinlock_t *ptl;
struct folio *folio = NULL;
LIST_HEAD(folio_list);
@@ -422,15 +430,15 @@ huge_unlock:
}
regular_folio:
- if (pmd_trans_unstable(pmd))
- return 0;
#endif
tlb_change_page_size(tlb, PAGE_SIZE);
- orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!start_pte)
+ return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr < end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
+ ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -447,25 +455,28 @@ regular_folio:
* are sure it's worth. Split it if we are only owner.
*/
if (folio_test_large(folio)) {
+ int err;
+
if (folio_mapcount(folio) != 1)
break;
if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
- folio_get(folio);
- if (!folio_trylock(folio)) {
- folio_put(folio);
- break;
- }
- pte_unmap_unlock(orig_pte, ptl);
- if (split_folio(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!folio_trylock(folio))
break;
- }
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
- orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (err)
+ break;
+ start_pte = pte =
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
pte--;
addr -= PAGE_SIZE;
continue;
@@ -510,8 +521,10 @@ regular_folio:
folio_deactivate(folio);
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(orig_pte, ptl);
+ if (start_pte) {
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ }
if (pageout)
reclaim_pages(&folio_list);
cond_resched();
@@ -612,7 +625,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
- pte_t *orig_pte, *pte, ptent;
+ pte_t *start_pte, *pte, ptent;
struct folio *folio;
int nr_swap = 0;
unsigned long next;
@@ -620,17 +633,16 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
- goto next;
-
- if (pmd_trans_unstable(pmd))
- return 0;
+ return 0;
tlb_change_page_size(tlb, PAGE_SIZE);
- orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
- ptent = *pte;
+ ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -664,23 +676,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
* deactivate all pages.
*/
if (folio_test_large(folio)) {
+ int err;
+
if (folio_mapcount(folio) != 1)
- goto out;
+ break;
+ if (!folio_trylock(folio))
+ break;
folio_get(folio);
- if (!folio_trylock(folio)) {
- folio_put(folio);
- goto out;
- }
- pte_unmap_unlock(orig_pte, ptl);
- if (split_folio(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- goto out;
- }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
- orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (err)
+ break;
+ start_pte = pte =
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
pte--;
addr -= PAGE_SIZE;
continue;
@@ -725,17 +740,18 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
folio_mark_lazyfree(folio);
}
-out:
+
if (nr_swap) {
if (current->mm == mm)
sync_mm_rss(mm);
-
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(orig_pte, ptl);
+ if (start_pte) {
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ }
cond_resched();
-next:
+
return 0;
}
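Both madvise walkers now share one split-and-retry shape: tear down the lazy-MMU batch and the PTE map before the (possibly sleeping) split_folio() call, then retake the map and stop cleanly if the table has gone away. A condensed sketch of that step, factored into a hypothetical helper; the real code keeps it inline so it can rewind pte/addr and continue its loop:

static int split_then_remap(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
			    struct folio *folio, pte_t **start_pte, spinlock_t **ptl)
{
	int err;

	if (!folio_trylock(folio))
		return -EBUSY;
	folio_get(folio);

	/* Nothing may stay mapped or batched while split_folio() sleeps. */
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(*start_pte, *ptl);
	*start_pte = NULL;

	err = split_folio(folio);
	folio_unlock(folio);
	folio_put(folio);
	if (err)
		return err;

	*start_pte = pte_offset_map_lock(mm, pmd, addr, ptl);
	if (!*start_pte)
		return -EAGAIN;	/* table vanished while unlocked: end the walk */
	arch_enter_lazy_mmu_mode();
	return 0;
}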
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index e1eb33f49059..a26dd8bcfcdb 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -35,7 +35,7 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct wp_walk *wpwalk = walk->private;
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
if (pte_write(ptent)) {
pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);
@@ -91,7 +91,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr,
{
struct wp_walk *wpwalk = walk->private;
struct clean_walk *cwalk = to_clean_walk(wpwalk);
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
if (pte_dirty(ptent)) {
pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
@@ -128,19 +128,11 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
{
pmd_t pmdval = pmdp_get_lockless(pmd);
- if (!pmd_trans_unstable(&pmdval))
- return 0;
-
- if (pmd_none(pmdval)) {
- walk->action = ACTION_AGAIN;
- return 0;
- }
-
- /* Huge pmd, present or migrated */
- walk->action = ACTION_CONTINUE;
- if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ /* Do not split a huge pmd, present or migrated */
+ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) {
WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
-
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -156,23 +148,15 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
pud_t pudval = READ_ONCE(*pud);
- if (!pud_trans_unstable(&pudval))
- return 0;
-
- if (pud_none(pudval)) {
- walk->action = ACTION_AGAIN;
- return 0;
- }
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
- /* Huge pud */
- walk->action = ACTION_CONTINUE;
- if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ /* Do not split a huge pud */
+ if (pud_trans_huge(pudval) || pud_devmap(pudval)) {
WARN_ON(pud_write(pudval) || pud_dirty(pudval));
+ walk->action = ACTION_CONTINUE;
+ }
#endif
-
return 0;
}
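With the generic walker now handling none and unstable entries itself, the pmd/pud callbacks shrink to a single job: refuse to descend into huge entries they must not split. A sketch of that reduced callback shape; the warning mirrors the original code's expectation that these mappings never carry writable or dirty huge entries:

static int skip_huge_pmd(pmd_t *pmd, unsigned long addr,
			 unsigned long end, struct mm_walk *walk)
{
	pmd_t pmdval = pmdp_get_lockless(pmd);

	if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) {
		WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
		walk->action = ACTION_CONTINUE;	/* leave the huge entry alone */
	}
	return 0;	/* everything else falls through to the pte_entry callback */
}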
diff --git a/mm/memblock.c b/mm/memblock.c
index 3d449aaba052..f9e61e565a53 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1436,6 +1436,15 @@ done:
*/
kmemleak_alloc_phys(found, size, 0);
+ /*
+ * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
+ * require memory to be accepted before it can be used by the
+ * guest.
+ *
+ * Accept the memory of the allocated buffer.
+ */
+ accept_memory(found, found + size);
+
return found;
}
@@ -2082,19 +2091,30 @@ static void __init memmap_init_reserved_pages(void)
{
struct memblock_region *region;
phys_addr_t start, end;
- u64 i;
+ int nid;
+
+ /*
+ * set nid on all reserved pages and also treat struct
+ * pages for the NOMAP regions as PageReserved
+ */
+ for_each_mem_region(region) {
+ nid = memblock_get_region_node(region);
+ start = region->base;
+ end = start + region->size;
+
+ if (memblock_is_nomap(region))
+ reserve_bootmem_region(start, end, nid);
+
+ memblock_set_node(start, end, &memblock.reserved, nid);
+ }
/* initialize struct pages for the reserved regions */
- for_each_reserved_mem_range(i, &start, &end)
- reserve_bootmem_region(start, end);
+ for_each_reserved_mem_region(region) {
+ nid = memblock_get_region_node(region);
+ start = region->base;
+ end = start + region->size;
- /* and also treat struct pages for the NOMAP regions as PageReserved */
- for_each_mem_region(region) {
- if (memblock_is_nomap(region)) {
- start = region->base;
- end = start + region->size;
- reserve_bootmem_region(start, end);
- }
+ reserve_bootmem_region(start, end, nid);
}
}
@@ -2122,7 +2142,7 @@ static unsigned long __init free_low_memory_core_early(void)
static int reset_managed_pages_done __initdata;
-void reset_node_managed_pages(pg_data_t *pgdat)
+static void __init reset_node_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
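For TDX or SEV-SNP guests the rule introduced above is simply that any physical range memblock hands out must be accepted before it is touched; the allocation path patched above now does that itself. A hedged sketch of the same rule applied in a tiny wrapper (the wrapper name is made up and only illustrative):

static phys_addr_t __init alloc_accepted_phys(phys_addr_t size, phys_addr_t align)
{
	phys_addr_t found = memblock_phys_alloc(size, align);

	if (found)
		accept_memory(found, found + size);	/* must happen before the guest uses it */
	return found;
}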
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4b27e245a055..e8ca4bdcb03c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -485,7 +485,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
if (lru_gen_enabled()) {
if (soft_limit_excess(memcg))
- lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
+ lru_gen_soft_reclaim(memcg, nid);
return;
}
@@ -639,7 +639,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
}
}
-static void do_flush_stats(bool atomic)
+static void do_flush_stats(void)
{
/*
* We always flush the entire tree, so concurrent flushers can just
@@ -652,30 +652,16 @@ static void do_flush_stats(bool atomic)
WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
- if (atomic)
- cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
- else
- cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+ cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
atomic_set(&stats_flush_threshold, 0);
atomic_set(&stats_flush_ongoing, 0);
}
-static bool should_flush_stats(void)
-{
- return atomic_read(&stats_flush_threshold) > num_online_cpus();
-}
-
void mem_cgroup_flush_stats(void)
{
- if (should_flush_stats())
- do_flush_stats(false);
-}
-
-void mem_cgroup_flush_stats_atomic(void)
-{
- if (should_flush_stats())
- do_flush_stats(true);
+ if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+ do_flush_stats();
}
void mem_cgroup_flush_stats_ratelimited(void)
@@ -690,7 +676,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
* Always flush here so that flushing in latency-sensitive paths is
* as cheap as possible.
*/
- do_flush_stats(false);
+ do_flush_stats();
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
@@ -1273,13 +1259,13 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
*
* This function iterates over tasks attached to @memcg or to any of its
* descendants and calls @fn for each task. If @fn returns a non-zero
- * value, the function breaks the iteration loop and returns the value.
- * Otherwise, it will iterate over all tasks and return 0.
+ * value, the function breaks the iteration loop. Otherwise, it will iterate
+ * over all tasks and return 0.
*
* This function must not be called for the root memory cgroup.
*/
-int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
- int (*fn)(struct task_struct *, void *), void *arg)
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+ int (*fn)(struct task_struct *, void *), void *arg)
{
struct mem_cgroup *iter;
int ret = 0;
@@ -1299,7 +1285,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
break;
}
}
- return ret;
}
#ifdef CONFIG_DEBUG_VM
@@ -1580,13 +1565,10 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
-static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
+static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
- struct seq_buf s;
int i;
- seq_buf_init(&s, buf, bufsize);
-
/*
* Provide statistics on the state of the memory subsystem as
* well as cumulative event counters that show past behavior.
@@ -1603,21 +1585,21 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
u64 size;
size = memcg_page_state_output(memcg, memory_stats[i].idx);
- seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
+ seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size);
if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
size += memcg_page_state_output(memcg,
NR_SLAB_RECLAIMABLE_B);
- seq_buf_printf(&s, "slab %llu\n", size);
+ seq_buf_printf(s, "slab %llu\n", size);
}
}
/* Accumulated memory events */
- seq_buf_printf(&s, "pgscan %lu\n",
+ seq_buf_printf(s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT) +
memcg_events(memcg, PGSCAN_KHUGEPAGED));
- seq_buf_printf(&s, "pgsteal %lu\n",
+ seq_buf_printf(s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT) +
memcg_events(memcg, PGSTEAL_KHUGEPAGED));
@@ -1627,13 +1609,24 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
memcg_vm_event_stat[i] == PGPGOUT)
continue;
- seq_buf_printf(&s, "%s %lu\n",
+ seq_buf_printf(s, "%s %lu\n",
vm_event_name(memcg_vm_event_stat[i]),
memcg_events(memcg, memcg_vm_event_stat[i]));
}
/* The above should easily fit into one page */
- WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+ WARN_ON_ONCE(seq_buf_has_overflowed(s));
+}
+
+static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
+
+static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
+{
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ memcg_stat_format(memcg, s);
+ else
+ memcg1_stat_format(memcg, s);
+ WARN_ON_ONCE(seq_buf_has_overflowed(s));
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -1671,6 +1664,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
/* Use static buffer, for the caller is holding oom_lock. */
static char buf[PAGE_SIZE];
+ struct seq_buf s;
lockdep_assert_held(&oom_lock);
@@ -1693,8 +1687,9 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(":");
- memory_stat_format(memcg, buf, sizeof(buf));
- pr_info("%s", buf);
+ seq_buf_init(&s, buf, sizeof(buf));
+ memory_stat_format(memcg, &s);
+ seq_buf_do_printk(&s, KERN_INFO);
}
/*
@@ -2028,26 +2023,12 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (locked)
mem_cgroup_oom_notify(memcg);
- if (locked && !READ_ONCE(memcg->oom_kill_disable)) {
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
- current->memcg_oom_order);
- } else {
- schedule();
- mem_cgroup_unmark_under_oom(memcg);
- finish_wait(&memcg_oom_waitq, &owait.wait);
- }
+ schedule();
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
- if (locked) {
+ if (locked)
mem_cgroup_oom_unlock(memcg);
- /*
- * There is no guarantee that an OOM-lock contender
- * sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitly.
- */
- memcg_oom_recover(memcg);
- }
cleanup:
current->memcg_in_oom = NULL;
css_put(&memcg->css);
@@ -2166,17 +2147,12 @@ again:
* When charge migration first begins, we can have multiple
* critical sections holding the fast-path RCU lock and one
* holding the slowpath move_lock. Track the task who has the
- * move_lock for unlock_page_memcg().
+ * move_lock for folio_memcg_unlock().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
}
-void lock_page_memcg(struct page *page)
-{
- folio_memcg_lock(page_folio(page));
-}
-
static void __folio_memcg_unlock(struct mem_cgroup *memcg)
{
if (memcg && memcg->move_lock_task == current) {
@@ -2204,11 +2180,6 @@ void folio_memcg_unlock(struct folio *folio)
__folio_memcg_unlock(folio_memcg(folio));
}
-void unlock_page_memcg(struct page *page)
-{
- folio_memcg_unlock(page_folio(page));
-}
-
struct memcg_stock_pcp {
local_lock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
@@ -2275,7 +2246,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
+ if (memcg == READ_ONCE(stock->cached) && stock->nr_pages >= nr_pages) {
stock->nr_pages -= nr_pages;
ret = true;
}
@@ -2290,7 +2261,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
*/
static void drain_stock(struct memcg_stock_pcp *stock)
{
- struct mem_cgroup *old = stock->cached;
+ struct mem_cgroup *old = READ_ONCE(stock->cached);
if (!old)
return;
@@ -2303,7 +2274,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
}
css_put(&old->css);
- stock->cached = NULL;
+ WRITE_ONCE(stock->cached, NULL);
}
static void drain_local_stock(struct work_struct *dummy)
@@ -2338,10 +2309,10 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
struct memcg_stock_pcp *stock;
stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached != memcg) { /* reset if necessary */
+ if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
drain_stock(stock);
css_get(&memcg->css);
- stock->cached = memcg;
+ WRITE_ONCE(stock->cached, memcg);
}
stock->nr_pages += nr_pages;
@@ -2383,7 +2354,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
bool flush = false;
rcu_read_lock();
- memcg = stock->cached;
+ memcg = READ_ONCE(stock->cached);
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
@@ -2884,7 +2855,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
*
* - the page lock
* - LRU isolation
- * - lock_page_memcg()
+ * - folio_memcg_lock()
* - exclusive reference
* - mem_cgroup_trylock_pages()
*/
@@ -3208,12 +3179,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
* accumulating over a page of vmstat data or when pgdat or idx
* changes.
*/
- if (stock->cached_objcg != objcg) {
+ if (READ_ONCE(stock->cached_objcg) != objcg) {
old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
- stock->cached_objcg = objcg;
+ WRITE_ONCE(stock->cached_objcg, objcg);
stock->cached_pgdat = pgdat;
} else if (stock->cached_pgdat != pgdat) {
/* Flush the existing cached vmstat data */
@@ -3267,7 +3238,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+ if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
@@ -3279,7 +3250,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
- struct obj_cgroup *old = stock->cached_objcg;
+ struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
if (!old)
return NULL;
@@ -3332,7 +3303,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
stock->cached_pgdat = NULL;
}
- stock->cached_objcg = NULL;
+ WRITE_ONCE(stock->cached_objcg, NULL);
/*
* The `old' objects needs to be released by the caller via
* obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
@@ -3343,10 +3314,11 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
+ struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
struct mem_cgroup *memcg;
- if (stock->cached_objcg) {
- memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (objcg) {
+ memcg = obj_cgroup_memcg(objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3365,10 +3337,10 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached_objcg != objcg) { /* reset if necessary */
+ if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
- stock->cached_objcg = objcg;
+ WRITE_ONCE(stock->cached_objcg, objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
allow_uncharge = true; /* Allow uncharge when objcg changes */
@@ -3699,27 +3671,13 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
if (mem_cgroup_is_root(memcg)) {
/*
- * We can reach here from irq context through:
- * uncharge_batch()
- * |--memcg_check_events()
- * |--mem_cgroup_threshold()
- * |--__mem_cgroup_threshold()
- * |--mem_cgroup_usage
- *
- * rstat flushing is an expensive operation that should not be
- * done from irq context; use stale stats in this case.
- * Arguably, usage threshold events are not reliable on the root
- * memcg anyway since its usage is ill-defined.
- *
- * Additionally, other call paths through memcg_check_events()
- * disable irqs, so make sure we are flushing stats atomically.
+ * Approximate root's usage from global state. This isn't
+ * perfect, but the root usage was always an approximation.
*/
- if (in_task())
- mem_cgroup_flush_stats_atomic();
- val = memcg_page_state(memcg, NR_FILE_PAGES) +
- memcg_page_state(memcg, NR_ANON_MAPPED);
+ val = global_node_page_state(NR_FILE_PAGES) +
+ global_node_page_state(NR_ANON_MAPPED);
if (swap)
- val += memcg_page_state(memcg, MEMCG_SWAP);
+ val += total_swap_pages - get_nr_swap_pages();
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -4135,9 +4093,8 @@ static const unsigned int memcg1_events[] = {
PGMAJFAULT,
};
-static int memcg_stat_show(struct seq_file *m, void *v)
+static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
@@ -4152,18 +4109,18 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+ seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
- memcg_events_local(memcg, memcg1_events[i]));
+ seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
+ memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %lu\n", lru_list_name(i),
- memcg_page_state_local(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -4171,11 +4128,11 @@ static int memcg_stat_show(struct seq_file *m, void *v)
memory = min(memory, READ_ONCE(mi->memory.max));
memsw = min(memsw, READ_ONCE(mi->memsw.max));
}
- seq_printf(m, "hierarchical_memory_limit %llu\n",
- (u64)memory * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
+ (u64)memory * PAGE_SIZE);
if (do_memsw_account())
- seq_printf(m, "hierarchical_memsw_limit %llu\n",
- (u64)memsw * PAGE_SIZE);
+ seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
+ (u64)memsw * PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
@@ -4183,19 +4140,19 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
- seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+ seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
(u64)nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
- seq_printf(m, "total_%s %llu\n",
- vm_event_name(memcg1_events[i]),
- (u64)memcg_events(memcg, memcg1_events[i]));
+ seq_buf_printf(s, "total_%s %llu\n",
+ vm_event_name(memcg1_events[i]),
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "total_%s %llu\n", lru_list_name(i),
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -4210,12 +4167,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
anon_cost += mz->lruvec.anon_cost;
file_cost += mz->lruvec.file_cost;
}
- seq_printf(m, "anon_cost %lu\n", anon_cost);
- seq_printf(m, "file_cost %lu\n", file_cost);
+ seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
+ seq_buf_printf(s, "file_cost %lu\n", file_cost);
}
#endif
-
- return 0;
}
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
@@ -4648,11 +4603,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- /*
- * wb_writeback() takes a spinlock and calls
- * wb_over_bg_thresh()->mem_cgroup_wb_stats(). Do not sleep.
- */
- mem_cgroup_flush_stats_atomic();
+ mem_cgroup_flush_stats();
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -5059,6 +5010,8 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p)
}
#endif
+static int memory_stat_show(struct seq_file *m, void *v);
+
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
@@ -5091,7 +5044,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
},
{
.name = "stat",
- .seq_show = memcg_stat_show,
+ .seq_show = memory_stat_show,
},
{
.name = "force_empty",
@@ -5464,7 +5417,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
- 2UL*HZ);
+ FLUSH_TIME);
lru_gen_online_memcg(memcg);
return 0;
offline_kmem:
@@ -5865,7 +5818,7 @@ static int mem_cgroup_move_account(struct page *page,
* with (un)charging, migration, LRU putback, or anything else
* that would rely on a stable page's memory cgroup.
*
- * Note that lock_page_memcg is a memcg lock, not a page lock,
+ * Note that folio_memcg_lock is a memcg lock, not a page lock,
* to save space. As soon as we switch page's memory cgroup to a
* new memcg that isn't locked, the above state can change
* concurrently again. Make sure we're truly done with it.
@@ -6057,11 +6010,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
return 0;
}
- if (pmd_trans_unstable(pmd))
- return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
for (; addr != end; pte++, addr += PAGE_SIZE)
- if (get_mctgt_type(vma, addr, *pte, NULL))
+ if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
mc.precharge++; /* increment precharge temporarily */
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -6277,12 +6230,12 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
return 0;
}
- if (pmd_trans_unstable(pmd))
- return 0;
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
for (; addr != end; addr += PAGE_SIZE) {
- pte_t ptent = *(pte++);
+ pte_t ptent = ptep_get(pte++);
bool device = false;
swp_entry_t ent;
@@ -6356,7 +6309,7 @@ static void mem_cgroup_move_charge(void)
{
lru_add_drain_all();
/*
- * Signal lock_page_memcg() to take the memcg's move_lock
+ * Signal folio_memcg_lock() to take the memcg's move_lock
* while we're moving its pages to another memcg. Then wait
* for already started RCU-only updates to finish.
*/
@@ -6634,10 +6587,12 @@ static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ struct seq_buf s;
if (!buf)
return -ENOMEM;
- memory_stat_format(memcg, buf, PAGE_SIZE);
+ seq_buf_init(&s, buf, PAGE_SIZE);
+ memory_stat_format(memcg, &s);
seq_puts(m, buf);
kfree(buf);
return 0;
@@ -6896,7 +6851,7 @@ static unsigned long effective_protection(unsigned long usage,
protected = min(usage, setting);
/*
* If all cgroups at this level combined claim and use more
- * protection then what the parent affords them, distribute
+ * protection than what the parent affords them, distribute
* shares in proportion to utilization.
*
* We are using actual utilization rather than the statically
@@ -7421,8 +7376,7 @@ static int __init mem_cgroup_init(void)
for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn;
- rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
- node_online(node) ? node : NUMA_NO_NODE);
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
rtpn->rb_root = RB_ROOT;
rtpn->rb_rightmost = NULL;
@@ -7656,6 +7610,14 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
+static u64 swap_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)memcg->swap.watermark * PAGE_SIZE;
+}
+
static int swap_high_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -7735,6 +7697,11 @@ static struct cftype swap_files[] = {
.write = swap_max_write,
},
{
+ .name = "swap.peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = swap_peak_read,
+ },
+ {
.name = "swap.events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct mem_cgroup, swap_events_file),
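The memcontrol.c rework funnels all statistics formatting through a caller-supplied seq_buf, so one formatter backs both the seq_file interfaces and the OOM report, and the OOM path can emit it line by line with seq_buf_do_printk(). A sketch of the printk-side usage, condensed into a hypothetical print_memcg_stats() and assuming the caller already serializes access to the static buffer (the real code holds oom_lock):

static void print_memcg_stats(struct mem_cgroup *memcg)
{
	static char buf[PAGE_SIZE];
	struct seq_buf s;

	seq_buf_init(&s, buf, sizeof(buf));
	memory_stat_format(memcg, &s);		/* warns once if the buffer overflows */
	seq_buf_do_printk(&s, KERN_INFO);	/* one printk per formatted line */
}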
diff --git a/mm/memfd.c b/mm/memfd.c
index 69b90c31d38c..e763e76f1106 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -371,12 +371,15 @@ SYSCALL_DEFINE2(memfd_create,
inode->i_mode &= ~0111;
file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
- *file_seals |= F_SEAL_EXEC;
+ if (file_seals) {
+ *file_seals &= ~F_SEAL_SEAL;
+ *file_seals |= F_SEAL_EXEC;
+ }
} else if (flags & MFD_ALLOW_SEALING) {
/* MFD_EXEC and MFD_ALLOW_SEALING are set */
file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
+ if (file_seals)
+ *file_seals &= ~F_SEAL_SEAL;
}
fd_install(fd, file);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5b663eca1f29..e245191e6b04 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -6,16 +6,16 @@
* High level machine check handler. Handles pages reported by the
* hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
- *
+ *
* In addition there is a "soft offline" entry point that allows stop using
* not-yet-corrupted-by-suspicious pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronously in respect to
- * other VM users, because memory failures could happen anytime and
- * anywhere. This could violate some of their assumptions. This is why
- * this code has to be extremely careful. Generally it tries to use
- * normal locking rules, as in get the standard locks, even if that means
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
* the error handling takes potentially a long time.
*
* It can be very tempting to add handling for obscure cases here.
@@ -25,12 +25,12 @@
* https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
* - The case actually shows up as a frequent (top 10) page state in
* tools/mm/page-types when running a real workload.
- *
+ *
* There are several operations here with exponential complexity because
- * of unsuitable VM data structures. For example the operation to map back
- * from RMAP chains to processes has to walk the complete process list and
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
* has non linear complexity with the number. But since memory corruptions
- * are rare we hope to get away with this. This avoids impacting the core
+ * are rare we hope to get away with this. This avoids impacting the core
* VM.
*/
@@ -123,7 +123,6 @@ const struct attribute_group memory_failure_attr_group = {
.attrs = memory_failure_attr,
};
-#ifdef CONFIG_SYSCTL
static struct ctl_table memory_failure_table[] = {
{
.procname = "memory_failure_early_kill",
@@ -146,14 +145,6 @@ static struct ctl_table memory_failure_table[] = {
{ }
};
-static int __init memory_failure_sysctl_init(void)
-{
- register_sysctl_init("vm", memory_failure_table);
- return 0;
-}
-late_initcall(memory_failure_sysctl_init);
-#endif /* CONFIG_SYSCTL */
-
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
@@ -395,6 +386,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ pte_t ptent;
VM_BUG_ON_VMA(address == -EFAULT, vma);
pgd = pgd_offset(vma->vm_mm, address);
@@ -414,7 +406,10 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
if (pmd_devmap(*pmd))
return PMD_SHIFT;
pte = pte_offset_map(pmd, address);
- if (pte_present(*pte) && pte_devmap(*pte))
+ if (!pte)
+ return 0;
+ ptent = ptep_get(pte);
+ if (pte_present(ptent) && pte_devmap(ptent))
ret = PAGE_SHIFT;
pte_unmap(pte);
return ret;
@@ -800,13 +795,13 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
goto out;
}
- if (pmd_trans_unstable(pmdp))
- goto out;
-
mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
addr, &ptl);
+ if (!ptep)
+ goto out;
+
for (; addr != end; ptep++, addr += PAGE_SIZE) {
- ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
+ ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT,
hwp->pfn, &hwp->tk);
if (ret == 1)
break;
@@ -2441,6 +2436,8 @@ static int __init memory_failure_init(void)
INIT_WORK(&mf_cpu->work, memory_failure_work_func);
}
+ register_sysctl_init("vm", memory_failure_table);
+
return 0;
}
core_initcall(memory_failure_init);
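Dropping the CONFIG_SYSCTL #ifdef works because register_sysctl_init() is still available (as a stub) when sysctl support is disabled, so the table can simply be registered from the existing core_initcall rather than a separate late_initcall. A minimal sketch of that consolidation, with made-up names:

static struct ctl_table my_feature_table[] = {
	/* ... "vm" tunables for the feature ... */
	{ }
};

static int __init my_feature_init(void)
{
	/* ... set up per-CPU state, work items, etc. ... */
	register_sysctl_init("vm", my_feature_table);	/* stubbed out without CONFIG_SYSCTL */
	return 0;
}
core_initcall(my_feature_init);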
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index e593e56e530b..a516e303e304 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -366,7 +366,7 @@ static void establish_demotion_targets(void)
lockdep_assert_held_once(&memory_tier_lock);
- if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
+ if (!node_demotion)
return;
disable_all_demotion_targets();
@@ -451,7 +451,6 @@ static void establish_demotion_targets(void)
}
#else
-static inline void disable_all_demotion_targets(void) {}
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */
diff --git a/mm/memory.c b/mm/memory.c
index f69fbc251198..0ae594703021 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,6 +77,7 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
+#include <linux/net_mm.h>
#include <trace/events/kmem.h>
@@ -699,15 +700,17 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
struct page *page, unsigned long address,
pte_t *ptep)
{
+ pte_t orig_pte;
pte_t pte;
swp_entry_t entry;
+ orig_pte = ptep_get(ptep);
pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
- if (pte_swp_soft_dirty(*ptep))
+ if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
- entry = pte_to_swp_entry(*ptep);
- if (pte_swp_uffd_wp(*ptep))
+ entry = pte_to_swp_entry(orig_pte);
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_mkuffd_wp(pte);
else if (is_writable_device_exclusive_entry(entry))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -744,7 +747,7 @@ static int
try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr)
{
- swp_entry_t entry = pte_to_swp_entry(*src_pte);
+ swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
struct page *page = pfn_swap_entry_to_page(entry);
if (trylock_page(page)) {
@@ -768,9 +771,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
unsigned long vm_flags = dst_vma->vm_flags;
- pte_t pte = *src_pte;
+ pte_t orig_pte = ptep_get(src_pte);
+ pte_t pte = orig_pte;
struct page *page;
- swp_entry_t entry = pte_to_swp_entry(pte);
+ swp_entry_t entry = pte_to_swp_entry(orig_pte);
if (likely(!non_swap_entry(entry))) {
if (swap_duplicate(entry) < 0)
@@ -785,8 +789,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
spin_unlock(&mmlist_lock);
}
/* Mark the swap entry as shared. */
- if (pte_swp_exclusive(*src_pte)) {
- pte = pte_swp_clear_exclusive(*src_pte);
+ if (pte_swp_exclusive(orig_pte)) {
+ pte = pte_swp_clear_exclusive(orig_pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
rss[MM_SWAPENTS]++;
@@ -805,9 +809,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
entry = make_readable_migration_entry(
swp_offset(entry));
pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*src_pte))
+ if (pte_swp_soft_dirty(orig_pte))
pte = pte_swp_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(*src_pte))
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
@@ -840,7 +844,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
entry = make_readable_device_private_entry(
swp_offset(entry));
pte = swp_entry_to_pte(entry);
- if (pte_swp_uffd_wp(*src_pte))
+ if (pte_swp_uffd_wp(orig_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
@@ -904,7 +908,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
/* All done, just insert the new page copy in the child */
pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
- if (userfaultfd_pte_wp(dst_vma, *src_pte))
+ if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
/* Uffd-wp needs to be delivered to dest pte as well */
pte = pte_mkuffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
@@ -922,7 +926,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
{
struct mm_struct *src_mm = src_vma->vm_mm;
unsigned long vm_flags = src_vma->vm_flags;
- pte_t pte = *src_pte;
+ pte_t pte = ptep_get(src_pte);
struct page *page;
struct folio *folio;
@@ -1002,6 +1006,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
+ pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
int progress, ret = 0;
int rss[NR_MM_COUNTERS];
@@ -1012,13 +1017,25 @@ again:
progress = 0;
init_rss_vec(rss);
+ /*
+ * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
+ * error handling here, assume that exclusive mmap_lock on dst and src
+ * protects anon from unexpected THP transitions; with shmem and file
+ * protected by mmap_lock-less collapse skipping areas with anon_vma
+ * (whereas vma_needs_copy() skips areas without anon_vma). A rework
+ * can remove such assumptions later, but this is good enough for now.
+ */
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte) {
ret = -ENOMEM;
goto out;
}
- src_pte = pte_offset_map(src_pmd, addr);
- src_ptl = pte_lockptr(src_mm, src_pmd);
+ src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+ if (!src_pte) {
+ pte_unmap_unlock(dst_pte, dst_ptl);
+ /* ret == 0 */
+ goto out;
+ }
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
@@ -1035,17 +1052,18 @@ again:
spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
break;
}
- if (pte_none(*src_pte)) {
+ ptent = ptep_get(src_pte);
+ if (pte_none(ptent)) {
progress++;
continue;
}
- if (unlikely(!pte_present(*src_pte))) {
+ if (unlikely(!pte_present(ptent))) {
ret = copy_nonpresent_pte(dst_mm, src_mm,
dst_pte, src_pte,
dst_vma, src_vma,
addr, rss);
if (ret == -EIO) {
- entry = pte_to_swp_entry(*src_pte);
+ entry = pte_to_swp_entry(ptep_get(src_pte));
break;
} else if (ret == -EBUSY) {
break;
@@ -1083,8 +1101,7 @@ again:
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
- spin_unlock(src_ptl);
- pte_unmap(orig_src_pte);
+ pte_unmap_unlock(orig_src_pte, src_ptl);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
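copy_pte_range() now maps the source table with pte_offset_map_nolock(), which returns the table's lock pointer without taking it, so the destination lock is acquired first and the source lock is nested under it; a NULL return means the source table is gone and there is nothing to copy. A condensed sketch of that ordering, with the copy loop elided and a hypothetical function name:

static int copy_pte_table(struct mm_struct *dst_mm, pmd_t *dst_pmd,
			  struct mm_struct *src_mm, pmd_t *src_pmd,
			  unsigned long addr)
{
	spinlock_t *src_ptl, *dst_ptl;
	pte_t *src_pte, *dst_pte;

	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
	if (!dst_pte)
		return -ENOMEM;

	src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
	if (!src_pte) {
		pte_unmap_unlock(dst_pte, dst_ptl);
		return 0;		/* source table vanished: nothing to copy */
	}
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	/* ... walk both tables, reading each source entry with ptep_get() ... */

	pte_unmap_unlock(src_pte, src_ptl);
	pte_unmap_unlock(dst_pte, dst_ptl);
	return 0;
}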
@@ -1388,14 +1405,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
swp_entry_t entry;
tlb_change_page_size(tlb, PAGE_SIZE);
-again:
init_rss_vec(rss);
- start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
- pte = start_pte;
+ start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return addr;
+
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
- pte_t ptent = *pte;
+ pte_t ptent = ptep_get(pte);
struct page *page;
if (pte_none(ptent))
@@ -1507,17 +1525,10 @@ again:
* If we forced a TLB flush (either due to running out of
* batch buffers or because we needed to flush dirty TLB
* entries before releasing the ptl), free the batched
- * memory too. Restart if we didn't do everything.
+ * memory too. Come back again if we didn't do everything.
*/
- if (force_flush) {
- force_flush = 0;
+ if (force_flush)
tlb_flush_mmu(tlb);
- }
-
- if (addr != end) {
- cond_resched();
- goto again;
- }
return addr;
}
@@ -1536,8 +1547,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false, NULL);
- else if (zap_huge_pmd(tlb, vma, pmd, addr))
- goto next;
+ else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
+ addr = next;
+ continue;
+ }
/* fall through */
} else if (details && details->single_folio &&
folio_test_pmd_mappable(details->single_folio) &&
@@ -1550,20 +1563,14 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
*/
spin_unlock(ptl);
}
-
- /*
- * Here there can be other concurrent MADV_DONTNEED or
- * trans huge page faults running, and if the pmd is
- * none or trans huge it can change under us. This is
- * because MADV_DONTNEED holds the mmap_lock in read
- * mode.
- */
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- goto next;
- next = zap_pte_range(tlb, vma, pmd, addr, next, details);
-next:
- cond_resched();
- } while (pmd++, addr = next, addr != end);
+ if (pmd_none(*pmd)) {
+ addr = next;
+ continue;
+ }
+ addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
+ if (addr != next)
+ pmd--;
+ } while (pmd++, cond_resched(), addr != end);
return addr;
}
@@ -1821,7 +1828,7 @@ static int validate_page_before_insert(struct page *page)
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
- if (!pte_none(*pte))
+ if (!pte_none(ptep_get(pte)))
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
@@ -1905,6 +1912,10 @@ more:
const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+ if (!start_pte) {
+ ret = -EFAULT;
+ goto out;
+ }
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
int err = insert_page_in_batch_locked(vma, pte,
addr, pages[curr_page_idx], prot);
@@ -2111,7 +2122,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
return VM_FAULT_OOM;
- if (!pte_none(*pte)) {
+ entry = ptep_get(pte);
+ if (!pte_none(entry)) {
if (mkwrite) {
/*
* For read faults on private mappings the PFN passed
@@ -2123,11 +2135,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* allocation and mapping invalidation so just skip the
* update.
*/
- if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
- WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
+ if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
goto out_unlock;
}
- entry = pte_mkyoung(*pte);
+ entry = pte_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, addr, pte, entry, 1))
update_mmu_cache(vma, addr, pte);
@@ -2339,7 +2351,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
return -ENOMEM;
arch_enter_lazy_mmu_mode();
do {
- BUG_ON(!pte_none(*pte));
+ BUG_ON(!pte_none(ptep_get(pte)));
if (!pfn_modify_allowed(pfn, prot)) {
err = -EACCES;
break;
@@ -2572,15 +2584,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
mapped_pte = pte = (mm == &init_mm) ?
pte_offset_kernel(pmd, addr) :
pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -EINVAL;
}
- BUG_ON(pmd_huge(*pmd));
-
arch_enter_lazy_mmu_mode();
if (fn) {
do {
- if (create || !pte_none(*pte)) {
+ if (create || !pte_none(ptep_get(pte))) {
err = fn(pte++, addr, data);
if (err)
break;
@@ -2781,10 +2793,9 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
- spin_lock(ptl);
- same = pte_same(*vmf->pte, vmf->orig_pte);
- spin_unlock(ptl);
+ spin_lock(vmf->ptl);
+ same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
+ spin_unlock(vmf->ptl);
}
#endif
pte_unmap(vmf->pte);
@@ -2804,7 +2815,6 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
int ret;
void *kaddr;
void __user *uaddr;
- bool locked = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = vmf->address;
@@ -2830,17 +2840,18 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
+ vmf->pte = NULL;
if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
- locked = true;
- if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
/*
* Other thread has already handled the fault
* and update local tlb only
*/
- update_mmu_tlb(vma, addr, vmf->pte);
+ if (vmf->pte)
+ update_mmu_tlb(vma, addr, vmf->pte);
ret = -EAGAIN;
goto pte_unlock;
}
@@ -2857,15 +2868,15 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
- if (locked)
+ if (vmf->pte)
goto warn;
/* Re-validate under PTL if the page is still mapped */
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
- locked = true;
- if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
/* The PTE changed under us, update local tlb */
- update_mmu_tlb(vma, addr, vmf->pte);
+ if (vmf->pte)
+ update_mmu_tlb(vma, addr, vmf->pte);
ret = -EAGAIN;
goto pte_unlock;
}
@@ -2888,7 +2899,7 @@ warn:
ret = 0;
pte_unlock:
- if (locked)
+ if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
kunmap_atomic(kaddr);
flush_dcache_page(dst);
@@ -3110,7 +3121,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* Re-check the pte - we dropped the lock
*/
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
if (old_folio) {
if (!folio_test_anon(old_folio)) {
dec_mm_counter(mm, mm_counter_file(&old_folio->page));
@@ -3178,19 +3189,20 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
/* Free the old page.. */
new_folio = old_folio;
page_copied = 1;
- } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ } else if (vmf->pte) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
}
- if (new_folio)
- folio_put(new_folio);
-
- pte_unmap_unlock(vmf->pte, vmf->ptl);
/*
* No need to double call mmu_notifier->invalidate_range() callback as
* the above ptep_clear_flush_notify() did already call it.
*/
mmu_notifier_invalidate_range_only_end(&range);
+
+ if (new_folio)
+ folio_put(new_folio);
if (old_folio) {
if (page_copied)
free_swap_cache(&old_folio->page);
@@ -3230,11 +3242,13 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
+ if (!vmf->pte)
+ return VM_FAULT_NOPAGE;
/*
* We might have raced with another page fault while we released the
* pte_offset_map_lock.
*/
- if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
@@ -3329,7 +3343,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
struct folio *folio = NULL;
if (likely(!unshare)) {
- if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+ if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
}
@@ -3388,8 +3402,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
goto copy;
if (!folio_test_lru(folio))
/*
- * Note: We cannot easily detect+handle references from
- * remote LRU pagevecs or references to LRU folios.
+ * We cannot easily detect+handle references from
+ * remote LRU caches or references to LRU folios.
*/
lru_add_drain();
if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
@@ -3591,10 +3605,11 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
folio_unlock(folio);
folio_put(folio);
@@ -3625,6 +3640,8 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
{
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ return 0;
/*
* Be careful so that we will only recover a special uffd-wp pte into a
* none pte. Otherwise it means the pte could have changed, so retry.
@@ -3633,7 +3650,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
* quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
* So is_pte_marker() check is not enough to safely drop the pte.
*/
- if (pte_same(vmf->orig_pte, *vmf->pte))
+ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
@@ -3728,10 +3745,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->page = pfn_swap_entry_to_page(entry);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
- spin_unlock(vmf->ptl);
- goto out;
- }
+ if (unlikely(!vmf->pte ||
+ !pte_same(ptep_get(vmf->pte),
+ vmf->orig_pte)))
+ goto unlock;
/*
* Get a page reference while we know the page can't be
@@ -3807,7 +3824,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ if (likely(vmf->pte &&
+ pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
ret = VM_FAULT_OOM;
goto unlock;
}
@@ -3863,7 +3881,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* If we want to map a page that's in the swapcache writable, we
* have to detect via the refcount if we're really the exclusive
* owner. Try removing the extra reference from the local LRU
- * pagevecs if required.
+ * caches if required.
*/
if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
!folio_test_ksm(folio) && !folio_test_lru(folio))
@@ -3877,7 +3895,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+ if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
goto out_nomap;
if (unlikely(!folio_test_uptodate(folio))) {
@@ -4003,13 +4021,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
if (si)
put_swap_device(si);
return ret;
out_nomap:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
folio_unlock(folio);
out_release:
@@ -4041,22 +4061,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
/*
- * Use pte_alloc() instead of pte_alloc_map(). We can't run
- * pte_offset_map() on pmds where a huge pmd might be created
- * from a different thread.
- *
- * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
- * parallel threads are excluded by other means.
- *
- * Here we only have mmap_read_lock(mm).
+ * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
+ * be distinguished from a transient failure of pte_offset_map().
*/
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
- /* See comment in handle_pte_fault() */
- if (unlikely(pmd_trans_unstable(vmf->pmd)))
- return 0;
-
/* Use the zero-page for reads */
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
@@ -4064,6 +4074,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ goto unlock;
if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
@@ -4104,6 +4116,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
+ if (!vmf->pte)
+ goto release;
if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
@@ -4131,7 +4145,8 @@ setpte:
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(vmf->pte, vmf->ptl);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
folio_put(folio);
@@ -4325,9 +4340,9 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
static bool vmf_pte_changed(struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
- return !pte_same(*vmf->pte, vmf->orig_pte);
+ return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
- return !pte_none(*vmf->pte);
+ return !pte_none(ptep_get(vmf->pte));
}
/**
@@ -4380,15 +4395,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
}
- /*
- * See comment in handle_pte_fault() for how this scenario happens, we
- * need to return NOPAGE so that we drop this page.
- */
- if (pmd_devmap_trans_unstable(vmf->pmd))
- return VM_FAULT_NOPAGE;
-
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
+ if (!vmf->pte)
+ return VM_FAULT_NOPAGE;
/* Re-check under ptl */
if (likely(!vmf_pte_changed(vmf))) {
@@ -4630,17 +4640,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
*/
if (!vma->vm_ops->fault) {
- /*
- * If we find a migration pmd entry or a none pmd entry, which
- * should never happen, return SIGBUS
- */
- if (unlikely(!pmd_present(*vmf->pmd)))
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
ret = VM_FAULT_SIGBUS;
else {
- vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
- vmf->pmd,
- vmf->address,
- &vmf->ptl);
/*
* Make sure this is not a temporary clearing of pte
* by holding ptl and checking again. A R/M/W update
@@ -4648,7 +4652,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
* we don't have concurrent modification by hardware
* followed by an update.
*/
- if (unlikely(pte_none(*vmf->pte)))
+ if (unlikely(pte_none(ptep_get(vmf->pte))))
ret = VM_FAULT_SIGBUS;
else
ret = VM_FAULT_NOPAGE;
@@ -4703,9 +4707,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*/
- vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -4774,9 +4777,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_MIGRATED;
} else {
flags |= TNF_MIGRATE_FAIL;
- vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
- spin_lock(vmf->ptl);
- if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
+ goto out;
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -4905,38 +4910,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
} else {
/*
- * If a huge pmd materialized under us just retry later. Use
- * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
- * of pmd_trans_huge() to ensure the pmd didn't become
- * pmd_trans_huge under us and then back to pmd_none, as a
- * result of MADV_DONTNEED running immediately after a huge pmd
- * fault in a different thread of this mm, in turn leading to a
- * misleading pmd_trans_huge() retval. All we have to ensure is
- * that it is a regular pmd that we can walk with
- * pte_offset_map() and we can do that through an atomic read
- * in C, which is what pmd_trans_unstable() provides.
- */
- if (pmd_devmap_trans_unstable(vmf->pmd))
- return 0;
- /*
* A regular pmd is established and it can't morph into a huge
- * pmd from under us anymore at this point because we hold the
- * mmap_lock read mode and khugepaged takes it in write mode.
- * So now it's safe to run pte_offset_map().
+ * pmd by anon khugepaged, since that takes mmap_lock in write
+ * mode; but shmem or file collapse to THP could still morph
+ * it into a huge pmd: just retry later if so.
*/
- vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
- vmf->orig_pte = *vmf->pte;
+ vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
+ return 0;
+ vmf->orig_pte = ptep_get_lockless(vmf->pte);
vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
- /*
- * some architectures can have larger ptes than wordsize,
- * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
- * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
- * accesses. The code below just needs a consistent view
- * for the ifs and we later double check anyway with the
- * ptl lock held. So here a barrier will do.
- */
- barrier();
if (pte_none(vmf->orig_pte)) {
pte_unmap(vmf->pte);
vmf->pte = NULL;
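/*
 * A sketch of the nolock idiom this hunk introduces (hypothetical function
 * name; only the interfaces visible above are assumed): map the pte without
 * taking its lock, snapshot it with ptep_get_lockless(), and only take
 * vmf->ptl later, re-checking with pte_same() before any modification.
 */
static bool example_snapshot_and_lock_pte(struct vm_fault *vmf)
{
	vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
					 vmf->address, &vmf->ptl);
	if (!vmf->pte)		/* no page table, or it became a huge pmd: retry */
		return false;
	vmf->orig_pte = ptep_get_lockless(vmf->pte);

	spin_lock(vmf->ptl);
	if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return false;	/* changed while we were not holding the lock */
	}
	/* ... the entry is now stable under vmf->ptl ... */
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return true;
}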
@@ -4952,10 +4937,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
- vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
- if (unlikely(!pte_same(*vmf->pte, entry))) {
+ if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
}
@@ -5060,9 +5044,8 @@ retry_pud:
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- vmf.orig_pmd = *vmf.pmd;
+ vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
- barrier();
if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(vmf.orig_pmd));
@@ -5262,6 +5245,125 @@ out:
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
+
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ /* Even if this succeeds, make it clear we *might* have slept */
+ if (likely(mmap_read_trylock(mm))) {
+ might_sleep();
+ return true;
+ }
+
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = instruction_pointer(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+
+ return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+ /*
+ * We don't have this operation yet.
+ *
+ * It should be easy enough to do: it's basically an
+ * atomic_long_try_cmpxchg_acquire()
+ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+ * it also needs the proper lockdep magic etc.
+ */
+ return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+ mmap_read_unlock(mm);
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = instruction_pointer(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+ return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do all this for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
+{
+ struct vm_area_struct *vma;
+
+ if (!get_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (likely(vma && (vma->vm_start <= addr)))
+ return vma;
+
+ /*
+ * Well, dang. We might still be successful, but only
+ * if we can extend a vma to do so.
+ */
+ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+ /*
+ * We can try to upgrade the mmap lock atomically,
+ * in which case we can continue to use the vma
+ * we already looked up.
+ *
+ * Otherwise we'll have to drop the mmap lock and
+ * re-take it, and also look up the vma again,
+ * re-checking it.
+ */
+ if (!mmap_upgrade_trylock(mm)) {
+ if (!upgrade_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto fail;
+ if (vma->vm_start <= addr)
+ goto success;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto fail;
+ }
+
+ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+
+fail:
+ mmap_write_unlock(mm);
+ return NULL;
+}
+#endif
+
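/*
 * A sketch of how an architecture's fault handler might use the helper
 * above (skeleton only: the SIGSEGV delivery path is left as a comment,
 * since it is arch specific and not part of this patch):
 */
static void example_arch_do_page_fault(struct pt_regs *regs,
				       unsigned long addr, unsigned int flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (!vma) {
		/* lock already dropped (or never taken): deliver SIGSEGV here */
		return;
	}

	/* mmap lock held for read; vma covers addr, expanded if needed */
	fault = handle_mm_fault(vma, addr, flags, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		mmap_read_unlock(mm);
}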
#ifdef CONFIG_PER_VMA_LOCK
/*
* Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
@@ -5280,12 +5382,12 @@ retry:
if (!vma)
goto inval;
- /* Only anonymous vmas are supported for now */
- if (!vma_is_anonymous(vma))
+ /* Only anonymous and tcp vmas are supported for now */
+ if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
goto inval;
/* find_mergeable_anon_vma uses adjacent vmas which are not locked */
- if (!vma->anon_vma)
+ if (!vma->anon_vma && !vma_is_tcp(vma))
goto inval;
if (!vma_start_read(vma))
@@ -5439,11 +5541,10 @@ int follow_pte(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- goto out;
-
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
- if (!pte_present(*ptep))
+ if (!ptep)
+ goto out;
+ if (!pte_present(ptep_get(ptep)))
goto unlock;
*ptepp = ptep;
return 0;
@@ -5480,7 +5581,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
- *pfn = pte_pfn(*ptep);
+ *pfn = pte_pfn(ptep_get(ptep));
pte_unmap_unlock(ptep, ptl);
return 0;
}
@@ -5500,7 +5601,7 @@ int follow_phys(struct vm_area_struct *vma,
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
goto out;
- pte = *ptep;
+ pte = ptep_get(ptep);
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
@@ -5544,7 +5645,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
retry:
if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
return -EINVAL;
- pte = *ptep;
+ pte = ptep_get(ptep);
pte_unmap_unlock(ptep, ptl);
prot = pgprot_val(pte_pgprot(pte));
@@ -5560,7 +5661,7 @@ retry:
if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
goto out_unmap;
- if (!pte_same(pte, *ptep)) {
+ if (!pte_same(pte, ptep_get(ptep))) {
pte_unmap_unlock(ptep, ptl);
iounmap(maddr);
@@ -5587,39 +5688,51 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
int len, unsigned int gup_flags)
{
- struct vm_area_struct *vma;
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
if (mmap_read_lock_killable(mm))
return 0;
+ /* Avoid triggering the temporary warning in __get_user_pages */
+ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
+ return 0;
+
/* ignore errors, just check how much was successfully transferred */
while (len) {
- int bytes, ret, offset;
+ int bytes, offset;
void *maddr;
- struct page *page = NULL;
+ struct vm_area_struct *vma = NULL;
+ struct page *page = get_user_page_vma_remote(mm, addr,
+ gup_flags, &vma);
+
+ if (IS_ERR_OR_NULL(page)) {
+ /* We might need to expand the stack to access it */
+ vma = vma_lookup(mm, addr);
+ if (!vma) {
+ vma = expand_stack(mm, addr);
+
+ /* mmap_lock was dropped on failure */
+ if (!vma)
+ return buf - old_buf;
+
+ /* Try again if stack expansion worked */
+ continue;
+ }
+
- ret = get_user_pages_remote(mm, addr, 1,
- gup_flags, &page, &vma, NULL);
- if (ret <= 0) {
-#ifndef CONFIG_HAVE_IOREMAP_PROT
- break;
-#else
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
- vma = vma_lookup(mm, addr);
- if (!vma)
- break;
+ bytes = 0;
+#ifdef CONFIG_HAVE_IOREMAP_PROT
if (vma->vm_ops && vma->vm_ops->access)
- ret = vma->vm_ops->access(vma, addr, buf,
- len, write);
- if (ret <= 0)
- break;
- bytes = ret;
+ bytes = vma->vm_ops->access(vma, addr, buf,
+ len, write);
#endif
+ if (bytes <= 0)
+ break;
} else {
bytes = len;
offset = addr & (PAGE_SIZE-1);
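/*
 * A sketch of the single-page remote-GUP helper used above (hypothetical
 * caller; the helper's calling convention is taken from this hunk: it
 * returns a page, an ERR_PTR(), or NULL, and fills in *vma on success):
 */
static int example_read_remote_byte(struct mm_struct *mm, unsigned long addr,
				    unsigned char *out)
{
	struct vm_area_struct *vma = NULL;
	struct page *page;
	unsigned char *maddr;

	if (mmap_read_lock_killable(mm))
		return -EINTR;

	page = get_user_page_vma_remote(mm, addr, FOLL_FORCE, &vma);
	if (IS_ERR_OR_NULL(page)) {
		mmap_read_unlock(mm);
		return -EFAULT;
	}

	maddr = kmap_local_page(page);
	*out = maddr[offset_in_page(addr)];
	kunmap_local(maddr);

	put_page(page);
	mmap_read_unlock(mm);
	return 0;
}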
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8e0fa209d533..3f231cf1b410 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -13,7 +13,6 @@
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
-#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
@@ -325,7 +324,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
}
if (check_pfn_span(pfn, nr_pages)) {
- WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
+ WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1);
return -EINVAL;
}
@@ -492,18 +491,6 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
set_zone_contiguous(zone);
}
-static void __remove_section(unsigned long pfn, unsigned long nr_pages,
- unsigned long map_offset,
- struct vmem_altmap *altmap)
-{
- struct mem_section *ms = __pfn_to_section(pfn);
-
- if (WARN_ON_ONCE(!valid_section(ms)))
- return;
-
- sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
-}
-
/**
* __remove_pages() - remove sections of pages
* @pfn: starting pageframe (must be aligned to start of a section)
@@ -520,12 +507,9 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
{
const unsigned long end_pfn = pfn + nr_pages;
unsigned long cur_nr_pages;
- unsigned long map_offset = 0;
-
- map_offset = vmem_altmap_offset(altmap);
if (check_pfn_span(pfn, nr_pages)) {
- WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
+ WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1);
return;
}
@@ -534,8 +518,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
/* Select all remaining pages up to the next section boundary */
cur_nr_pages = min(end_pfn - pfn,
SECTION_ALIGN_UP(pfn + 1) - pfn);
- __remove_section(pfn, cur_nr_pages, map_offset, altmap);
- map_offset = 0;
+ sparse_remove_section(pfn, cur_nr_pages, altmap);
}
}
@@ -1172,16 +1155,6 @@ failed_addition:
return ret;
}
-static void reset_node_present_pages(pg_data_t *pgdat)
-{
- struct zone *z;
-
- for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- z->present_pages = 0;
-
- pgdat->node_present_pages = 0;
-}
-
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_init_pgdat(int nid)
{
@@ -1204,15 +1177,6 @@ static pg_data_t __ref *hotadd_init_pgdat(int nid)
*/
build_all_zonelists(pgdat);
- /*
- * When memory is hot-added, all the memory is in offline state. So
- * clear all zones' present_pages because they will be updated in
- * online_pages() and offline_pages().
- * TODO: should be in free_area_init_core_hotplug?
- */
- reset_node_managed_pages(pgdat);
- reset_node_present_pages(pgdat);
-
return pgdat;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1756389a0609..edc25195f5bd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -508,20 +508,23 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long flags = qp->flags;
bool has_unmovable = false;
pte_t *pte, *mapped_pte;
+ pte_t ptent;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl)
return queue_folios_pmd(pmd, ptl, addr, end, walk);
- if (pmd_trans_unstable(pmd))
- return 0;
-
mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (; addr != end; pte++, addr += PAGE_SIZE) {
- if (!pte_present(*pte))
+ ptent = ptep_get(pte);
+ if (!pte_present(ptent))
continue;
- folio = vm_normal_folio(vma, addr, *pte);
+ folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
/*
@@ -1195,24 +1198,22 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
-static struct page *new_page(struct page *page, unsigned long start)
+static struct folio *new_folio(struct folio *src, unsigned long start)
{
- struct folio *dst, *src = page_folio(page);
struct vm_area_struct *vma;
unsigned long address;
VMA_ITERATOR(vmi, current->mm, start);
gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
for_each_vma(vmi, vma) {
- address = page_address_in_vma(page, vma);
+ address = page_address_in_vma(&src->page, vma);
if (address != -EFAULT)
break;
}
if (folio_test_hugetlb(src)) {
- dst = alloc_hugetlb_folio_vma(folio_hstate(src),
+ return alloc_hugetlb_folio_vma(folio_hstate(src),
vma, address);
- return &dst->page;
}
if (folio_test_large(src))
@@ -1221,9 +1222,8 @@ static struct page *new_page(struct page *page, unsigned long start)
/*
* if !vma, vma_alloc_folio() will use task or system default policy
*/
- dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
+ return vma_alloc_folio(gfp, folio_order(src), vma, address,
folio_test_large(src));
- return &dst->page;
}
#else
@@ -1239,7 +1239,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct page *new_page(struct page *page, unsigned long start)
+static struct folio *new_folio(struct folio *src, unsigned long start)
{
return NULL;
}
@@ -1334,7 +1334,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
- nr_failed = migrate_pages(&pagelist, new_page, NULL,
+ nr_failed = migrate_pages(&pagelist, new_folio, NULL,
start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
if (nr_failed)
putback_movable_pages(&pagelist);
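/*
 * The same pattern recurs in the pagewalk callbacks of this series
 * (queue_folios_pte_range() here, mincore and mlock below): when
 * pte_offset_map_lock() fails, the callback asks the walker to retry the
 * range via ACTION_AGAIN instead of silently skipping it.  A minimal
 * callback following that pattern (hypothetical name) looks like:
 */
static int example_pte_range(pmd_t *pmd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
{
	pte_t *start_pte, *pte;
	spinlock_t *ptl;

	start_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;	/* pmd changed: walk it again */
		return 0;
	}
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		pte_t ptent = ptep_get(pte);

		if (!pte_present(ptent))
			continue;
		/* ... per-pte work using ptent ... */
	}
	pte_unmap_unlock(start_pte, ptl);
	return 0;
}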
diff --git a/mm/migrate.c b/mm/migrate.c
index 01cac26a3127..24baad2571e3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -21,7 +21,6 @@
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
-#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
@@ -188,6 +187,7 @@ static bool remove_migration_pte(struct folio *folio,
while (page_vma_mapped_walk(&pvmw)) {
rmap_t rmap_flags = RMAP_NONE;
+ pte_t old_pte;
pte_t pte;
swp_entry_t entry;
struct page *new;
@@ -210,17 +210,18 @@ static bool remove_migration_pte(struct folio *folio,
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
- if (pte_swp_soft_dirty(*pvmw.pte))
+ old_pte = ptep_get(pvmw.pte);
+ if (pte_swp_soft_dirty(old_pte))
pte = pte_mksoft_dirty(pte);
- entry = pte_to_swp_entry(*pvmw.pte);
+ entry = pte_to_swp_entry(old_pte);
if (!is_migration_entry_young(entry))
pte = pte_mkold(pte);
if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
pte = pte_mkdirty(pte);
if (is_writable_migration_entry(entry))
pte = pte_mkwrite(pte);
- else if (pte_swp_uffd_wp(*pvmw.pte))
+ else if (pte_swp_uffd_wp(old_pte))
pte = pte_mkuffd_wp(pte);
if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
@@ -234,9 +235,9 @@ static bool remove_migration_pte(struct folio *folio,
entry = make_readable_device_private_entry(
page_to_pfn(new));
pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*pvmw.pte))
+ if (pte_swp_soft_dirty(old_pte))
pte = pte_swp_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(*pvmw.pte))
+ if (pte_swp_uffd_wp(old_pte))
pte = pte_swp_mkuffd_wp(pte);
}
@@ -296,14 +297,21 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
*/
-void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
- spinlock_t *ptl)
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
{
+ spinlock_t *ptl;
+ pte_t *ptep;
pte_t pte;
swp_entry_t entry;
- spin_lock(ptl);
- pte = *ptep;
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!ptep)
+ return;
+
+ pte = ptep_get(ptep);
+ pte_unmap(ptep);
+
if (!is_swap_pte(pte))
goto out;
@@ -311,18 +319,10 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
if (!is_migration_entry(entry))
goto out;
- migration_entry_wait_on_locked(entry, ptep, ptl);
+ migration_entry_wait_on_locked(entry, ptl);
return;
out:
- pte_unmap_unlock(ptep, ptl);
-}
-
-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
-{
- spinlock_t *ptl = pte_lockptr(mm, pmd);
- pte_t *ptep = pte_offset_map(pmd, address);
- __migration_entry_wait(mm, ptep, ptl);
+ spin_unlock(ptl);
}
#ifdef CONFIG_HUGETLB_PAGE
@@ -332,9 +332,9 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
*
* This function will release the vma lock before returning.
*/
-void __migration_entry_wait_huge(struct vm_area_struct *vma,
- pte_t *ptep, spinlock_t *ptl)
+void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *ptep)
{
+ spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
pte_t pte;
hugetlb_vma_assert_locked(vma);
@@ -352,16 +352,9 @@ void __migration_entry_wait_huge(struct vm_area_struct *vma,
* lock release in migration_entry_wait_on_locked().
*/
hugetlb_vma_unlock_read(vma);
- migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
+ migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
}
}
-
-void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
-{
- spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
-
- __migration_entry_wait_huge(vma, pte, ptl);
-}
#endif
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -372,7 +365,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
ptl = pmd_lock(mm, pmd);
if (!is_pmd_migration_entry(*pmd))
goto unlock;
- migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
+ migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl);
return;
unlock:
spin_unlock(ptl);
@@ -492,6 +485,11 @@ int folio_migrate_mapping(struct address_space *mapping,
if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
+
+ if (folio_test_pmd_mappable(folio)) {
+ __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
+ }
}
#ifdef CONFIG_SWAP
if (folio_test_swapcache(folio)) {
@@ -692,37 +690,32 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
enum migrate_mode mode)
{
struct buffer_head *bh = head;
+ struct buffer_head *failed_bh;
- /* Simple case, sync compaction */
- if (mode != MIGRATE_ASYNC) {
- do {
- lock_buffer(bh);
- bh = bh->b_this_page;
-
- } while (bh != head);
-
- return true;
- }
-
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
do {
if (!trylock_buffer(bh)) {
- /*
- * We failed to lock the buffer and cannot stall in
- * async migration. Release the taken locks
- */
- struct buffer_head *failed_bh = bh;
- bh = head;
- while (bh != failed_bh) {
- unlock_buffer(bh);
- bh = bh->b_this_page;
- }
- return false;
+ if (mode == MIGRATE_ASYNC)
+ goto unlock;
+ if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh))
+ goto unlock;
+ lock_buffer(bh);
}
bh = bh->b_this_page;
} while (bh != head);
+
return true;
+
+unlock:
+ /* We failed to lock the buffer and cannot stall. */
+ failed_bh = bh;
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ bh = bh->b_this_page;
+ }
+
+ return false;
}
static int __buffer_migrate_folio(struct address_space *mapping,
@@ -1072,15 +1065,13 @@ static void migrate_folio_undo_src(struct folio *src,
}
/* Restore the destination folio to the original state upon failure */
-static void migrate_folio_undo_dst(struct folio *dst,
- bool locked,
- free_page_t put_new_page,
- unsigned long private)
+static void migrate_folio_undo_dst(struct folio *dst, bool locked,
+ free_folio_t put_new_folio, unsigned long private)
{
if (locked)
folio_unlock(dst);
- if (put_new_page)
- put_new_page(&dst->page, private);
+ if (put_new_folio)
+ put_new_folio(dst, private);
else
folio_put(dst);
}
@@ -1104,14 +1095,13 @@ static void migrate_folio_done(struct folio *src,
}
/* Obtain the lock on page, remove all ptes. */
-static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
- unsigned long private, struct folio *src,
- struct folio **dstp, enum migrate_mode mode,
- enum migrate_reason reason, struct list_head *ret)
+static int migrate_folio_unmap(new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ struct folio *src, struct folio **dstp, enum migrate_mode mode,
+ enum migrate_reason reason, struct list_head *ret)
{
struct folio *dst;
int rc = -EAGAIN;
- struct page *newpage = NULL;
int page_was_mapped = 0;
struct anon_vma *anon_vma = NULL;
bool is_lru = !__PageMovable(&src->page);
@@ -1128,10 +1118,9 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
return MIGRATEPAGE_SUCCESS;
}
- newpage = get_new_page(&src->page, private);
- if (!newpage)
+ dst = get_new_folio(src, private);
+ if (!dst)
return -ENOMEM;
- dst = page_folio(newpage);
*dstp = dst;
dst->private = NULL;
@@ -1156,6 +1145,14 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page
if (current->flags & PF_MEMALLOC)
goto out;
+ /*
+ * In "light" mode, we can wait for transient locks (e.g.
+ * inserting a page into the page table), but it's not
+ * worth waiting for I/O.
+ */
+ if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src))
+ goto out;
+
folio_lock(src);
}
locked = true;
@@ -1251,13 +1248,13 @@ out:
ret = NULL;
migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret);
- migrate_folio_undo_dst(dst, dst_locked, put_new_page, private);
+ migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
return rc;
}
/* Migrate the folio to the newly allocated folio in dst. */
-static int migrate_folio_move(free_page_t put_new_page, unsigned long private,
+static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
struct folio *src, struct folio *dst,
enum migrate_mode mode, enum migrate_reason reason,
struct list_head *ret)
@@ -1329,7 +1326,7 @@ out:
}
migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret);
- migrate_folio_undo_dst(dst, true, put_new_page, private);
+ migrate_folio_undo_dst(dst, true, put_new_folio, private);
return rc;
}
@@ -1352,16 +1349,14 @@ out:
* because then pte is replaced with migration swap entry and direct I/O code
* will wait in the page fault for migration to complete.
*/
-static int unmap_and_move_huge_page(new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
- struct page *hpage, int force,
- enum migrate_mode mode, int reason,
- struct list_head *ret)
+static int unmap_and_move_huge_page(new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ struct folio *src, int force, enum migrate_mode mode,
+ int reason, struct list_head *ret)
{
- struct folio *dst, *src = page_folio(hpage);
+ struct folio *dst;
int rc = -EAGAIN;
int page_was_mapped = 0;
- struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
@@ -1371,10 +1366,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return MIGRATEPAGE_SUCCESS;
}
- new_hpage = get_new_page(hpage, private);
- if (!new_hpage)
+ dst = get_new_folio(src, private);
+ if (!dst)
return -ENOMEM;
- dst = page_folio(new_hpage);
if (!folio_trylock(src)) {
if (!force)
@@ -1415,7 +1409,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
* semaphore in write mode here and set TTU_RMAP_LOCKED
* to let lower levels know we have taken the lock.
*/
- mapping = hugetlb_page_mapping_lock_write(hpage);
+ mapping = hugetlb_page_mapping_lock_write(&src->page);
if (unlikely(!mapping))
goto unlock_put_anon;
@@ -1445,7 +1439,7 @@ put_anon:
if (rc == MIGRATEPAGE_SUCCESS) {
move_hugetlb_state(src, dst, reason);
- put_new_page = NULL;
+ put_new_folio = NULL;
}
out_unlock:
@@ -1461,8 +1455,8 @@ out:
* it. Otherwise, put_page() will drop the reference grabbed during
* isolation.
*/
- if (put_new_page)
- put_new_page(new_hpage, private);
+ if (put_new_folio)
+ put_new_folio(dst, private);
else
folio_putback_active_hugetlb(dst);
@@ -1509,8 +1503,8 @@ struct migrate_pages_stats {
* exist any more. It is caller's responsibility to call putback_movable_pages()
* only if ret != 0.
*/
-static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
+static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
enum migrate_mode mode, int reason,
struct migrate_pages_stats *stats,
struct list_head *ret_folios)
@@ -1548,9 +1542,9 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
continue;
}
- rc = unmap_and_move_huge_page(get_new_page,
- put_new_page, private,
- &folio->page, pass > 2, mode,
+ rc = unmap_and_move_huge_page(get_new_folio,
+ put_new_folio, private,
+ folio, pass > 2, mode,
reason, ret_folios);
/*
* The rules are:
@@ -1607,20 +1601,17 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
* deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the
* length of the from list must be <= 1.
*/
-static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
- enum migrate_mode mode, int reason, struct list_head *ret_folios,
- struct list_head *split_folios, struct migrate_pages_stats *stats,
- int nr_pass)
+static int migrate_pages_batch(struct list_head *from,
+ new_folio_t get_new_folio, free_folio_t put_new_folio,
+ unsigned long private, enum migrate_mode mode, int reason,
+ struct list_head *ret_folios, struct list_head *split_folios,
+ struct migrate_pages_stats *stats, int nr_pass)
{
int retry = 1;
- int large_retry = 1;
int thp_retry = 1;
int nr_failed = 0;
int nr_retry_pages = 0;
- int nr_large_failed = 0;
int pass = 0;
- bool is_large = false;
bool is_thp = false;
struct folio *folio, *folio2, *dst = NULL, *dst2;
int rc, rc_saved = 0, nr_pages;
@@ -1631,20 +1622,13 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
!list_empty(from) && !list_is_singular(from));
- for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
+ for (pass = 0; pass < nr_pass && retry; pass++) {
retry = 0;
- large_retry = 0;
thp_retry = 0;
nr_retry_pages = 0;
list_for_each_entry_safe(folio, folio2, from, lru) {
- /*
- * Large folio statistics is based on the source large
- * folio. Capture required information that might get
- * lost during migration.
- */
- is_large = folio_test_large(folio);
- is_thp = is_large && folio_test_pmd_mappable(folio);
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
nr_pages = folio_nr_pages(folio);
cond_resched();
@@ -1660,7 +1644,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
* list is processed.
*/
if (!thp_migration_supported() && is_thp) {
- nr_large_failed++;
+ nr_failed++;
stats->nr_thp_failed++;
if (!try_split_folio(folio, split_folios)) {
stats->nr_thp_split++;
@@ -1671,8 +1655,9 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
continue;
}
- rc = migrate_folio_unmap(get_new_page, put_new_page, private,
- folio, &dst, mode, reason, ret_folios);
+ rc = migrate_folio_unmap(get_new_folio, put_new_folio,
+ private, folio, &dst, mode, reason,
+ ret_folios);
/*
* The rules are:
* Success: folio will be freed
@@ -1688,38 +1673,33 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
* When memory is low, don't bother to try to migrate
* other folios, move unmapped folios, then exit.
*/
- if (is_large) {
- nr_large_failed++;
- stats->nr_thp_failed += is_thp;
- /* Large folio NUMA faulting doesn't split to retry. */
- if (!nosplit) {
- int ret = try_split_folio(folio, split_folios);
-
- if (!ret) {
- stats->nr_thp_split += is_thp;
- break;
- } else if (reason == MR_LONGTERM_PIN &&
- ret == -EAGAIN) {
- /*
- * Try again to split large folio to
- * mitigate the failure of longterm pinning.
- */
- large_retry++;
- thp_retry += is_thp;
- nr_retry_pages += nr_pages;
- /* Undo duplicated failure counting. */
- nr_large_failed--;
- stats->nr_thp_failed -= is_thp;
- break;
- }
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
+ /* Large folio NUMA faulting doesn't split to retry. */
+ if (folio_test_large(folio) && !nosplit) {
+ int ret = try_split_folio(folio, split_folios);
+
+ if (!ret) {
+ stats->nr_thp_split += is_thp;
+ break;
+ } else if (reason == MR_LONGTERM_PIN &&
+ ret == -EAGAIN) {
+ /*
+ * Try again to split large folio to
+ * mitigate the failure of longterm pinning.
+ */
+ retry++;
+ thp_retry += is_thp;
+ nr_retry_pages += nr_pages;
+ /* Undo duplicated failure counting. */
+ nr_failed--;
+ stats->nr_thp_failed -= is_thp;
+ break;
}
- } else {
- nr_failed++;
}
stats->nr_failed_pages += nr_pages + nr_retry_pages;
/* nr_failed isn't updated for not used */
- nr_large_failed += large_retry;
stats->nr_thp_failed += thp_retry;
rc_saved = rc;
if (list_empty(&unmap_folios))
@@ -1727,12 +1707,8 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
else
goto move;
case -EAGAIN:
- if (is_large) {
- large_retry++;
- thp_retry += is_thp;
- } else {
- retry++;
- }
+ retry++;
+ thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
case MIGRATEPAGE_SUCCESS:
@@ -1750,20 +1726,14 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
* removed from migration folio list and not
* retried in the next outer loop.
*/
- if (is_large) {
- nr_large_failed++;
- stats->nr_thp_failed += is_thp;
- } else {
- nr_failed++;
- }
-
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
stats->nr_failed_pages += nr_pages;
break;
}
}
}
nr_failed += retry;
- nr_large_failed += large_retry;
stats->nr_thp_failed += thp_retry;
stats->nr_failed_pages += nr_retry_pages;
move:
@@ -1771,22 +1741,20 @@ move:
try_to_unmap_flush();
retry = 1;
- for (pass = 0; pass < nr_pass && (retry || large_retry); pass++) {
+ for (pass = 0; pass < nr_pass && retry; pass++) {
retry = 0;
- large_retry = 0;
thp_retry = 0;
nr_retry_pages = 0;
dst = list_first_entry(&dst_folios, struct folio, lru);
dst2 = list_next_entry(dst, lru);
list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
- is_large = folio_test_large(folio);
- is_thp = is_large && folio_test_pmd_mappable(folio);
+ is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
nr_pages = folio_nr_pages(folio);
cond_resched();
- rc = migrate_folio_move(put_new_page, private,
+ rc = migrate_folio_move(put_new_folio, private,
folio, dst, mode,
reason, ret_folios);
/*
@@ -1797,12 +1765,8 @@ move:
*/
switch(rc) {
case -EAGAIN:
- if (is_large) {
- large_retry++;
- thp_retry += is_thp;
- } else {
- retry++;
- }
+ retry++;
+ thp_retry += is_thp;
nr_retry_pages += nr_pages;
break;
case MIGRATEPAGE_SUCCESS:
@@ -1810,13 +1774,8 @@ move:
stats->nr_thp_succeeded += is_thp;
break;
default:
- if (is_large) {
- nr_large_failed++;
- stats->nr_thp_failed += is_thp;
- } else {
- nr_failed++;
- }
-
+ nr_failed++;
+ stats->nr_thp_failed += is_thp;
stats->nr_failed_pages += nr_pages;
break;
}
@@ -1825,14 +1784,10 @@ move:
}
}
nr_failed += retry;
- nr_large_failed += large_retry;
stats->nr_thp_failed += thp_retry;
stats->nr_failed_pages += nr_retry_pages;
- if (rc_saved)
- rc = rc_saved;
- else
- rc = nr_failed + nr_large_failed;
+ rc = rc_saved ? : nr_failed;
out:
/* Cleanup remaining folios */
dst = list_first_entry(&dst_folios, struct folio, lru);
@@ -1845,7 +1800,7 @@ out:
migrate_folio_undo_src(folio, page_was_mapped, anon_vma,
true, ret_folios);
list_del(&dst->lru);
- migrate_folio_undo_dst(dst, true, put_new_page, private);
+ migrate_folio_undo_dst(dst, true, put_new_folio, private);
dst = dst2;
dst2 = list_next_entry(dst, lru);
}
@@ -1853,10 +1808,11 @@ out:
return rc;
}
-static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
- enum migrate_mode mode, int reason, struct list_head *ret_folios,
- struct list_head *split_folios, struct migrate_pages_stats *stats)
+static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
+ enum migrate_mode mode, int reason,
+ struct list_head *ret_folios, struct list_head *split_folios,
+ struct migrate_pages_stats *stats)
{
int rc, nr_failed = 0;
LIST_HEAD(folios);
@@ -1864,7 +1820,7 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
memset(&astats, 0, sizeof(astats));
/* Try to migrate in batch with MIGRATE_ASYNC mode firstly */
- rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC,
+ rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC,
reason, &folios, split_folios, &astats,
NR_MAX_MIGRATE_ASYNC_RETRY);
stats->nr_succeeded += astats.nr_succeeded;
@@ -1886,7 +1842,7 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
list_splice_tail_init(&folios, from);
while (!list_empty(from)) {
list_move(from->next, &folios);
- rc = migrate_pages_batch(&folios, get_new_page, put_new_page,
+ rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
private, mode, reason, ret_folios,
split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
list_splice_tail_init(&folios, ret_folios);
@@ -1903,11 +1859,11 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
* supplied as the target for the page migration
*
* @from: The list of folios to be migrated.
- * @get_new_page: The function used to allocate free folios to be used
+ * @get_new_folio: The function used to allocate free folios to be used
* as the target of the folio migration.
- * @put_new_page: The function used to free target folios if migration
+ * @put_new_folio: The function used to free target folios if migration
* fails, or NULL if no special handling is necessary.
- * @private: Private data to be passed on to get_new_page()
+ * @private: Private data to be passed on to get_new_folio()
* @mode: The migration mode that specifies the constraints for
* folio migration, if any.
* @reason: The reason for folio migration.
@@ -1924,8 +1880,8 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page,
* considered as the number of non-migrated large folio, no matter how many
* split folios of the large folio are migrated successfully.
*/
-int migrate_pages(struct list_head *from, new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
+int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
+ free_folio_t put_new_folio, unsigned long private,
enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
{
int rc, rc_gather;
@@ -1940,7 +1896,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
memset(&stats, 0, sizeof(stats));
- rc_gather = migrate_hugetlbs(from, get_new_page, put_new_page, private,
+ rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private,
mode, reason, &stats, &ret_folios);
if (rc_gather < 0)
goto out;
@@ -1963,12 +1919,14 @@ again:
else
list_splice_init(from, &folios);
if (mode == MIGRATE_ASYNC)
- rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
- mode, reason, &ret_folios, &split_folios, &stats,
- NR_MAX_MIGRATE_PAGES_RETRY);
+ rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
+ private, mode, reason, &ret_folios,
+ &split_folios, &stats,
+ NR_MAX_MIGRATE_PAGES_RETRY);
else
- rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private,
- mode, reason, &ret_folios, &split_folios, &stats);
+ rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio,
+ private, mode, reason, &ret_folios,
+ &split_folios, &stats);
list_splice_tail_init(&folios, &ret_folios);
if (rc < 0) {
rc_gather = rc;
@@ -1981,8 +1939,9 @@ again:
* is counted as 1 failure already. And, we only try to migrate
* with minimal effort, force MIGRATE_ASYNC mode and retry once.
*/
- migrate_pages_batch(&split_folios, get_new_page, put_new_page, private,
- MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1);
+ migrate_pages_batch(&split_folios, get_new_folio,
+ put_new_folio, private, MIGRATE_ASYNC, reason,
+ &ret_folios, NULL, &stats, 1);
list_splice_tail_init(&split_folios, &ret_folios);
}
rc_gather += rc;
@@ -2017,14 +1976,11 @@ out:
return rc_gather;
}
-struct page *alloc_migration_target(struct page *page, unsigned long private)
+struct folio *alloc_migration_target(struct folio *src, unsigned long private)
{
- struct folio *folio = page_folio(page);
struct migration_target_control *mtc;
gfp_t gfp_mask;
unsigned int order = 0;
- struct folio *hugetlb_folio = NULL;
- struct folio *new_folio = NULL;
int nid;
int zidx;
@@ -2032,33 +1988,30 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
gfp_mask = mtc->gfp_mask;
nid = mtc->nid;
if (nid == NUMA_NO_NODE)
- nid = folio_nid(folio);
+ nid = folio_nid(src);
- if (folio_test_hugetlb(folio)) {
- struct hstate *h = folio_hstate(folio);
+ if (folio_test_hugetlb(src)) {
+ struct hstate *h = folio_hstate(src);
gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
- hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid,
+ return alloc_hugetlb_folio_nodemask(h, nid,
mtc->nmask, gfp_mask);
- return &hugetlb_folio->page;
}
- if (folio_test_large(folio)) {
+ if (folio_test_large(src)) {
/*
* clear __GFP_RECLAIM to make the migration callback
* consistent with regular THP allocations.
*/
gfp_mask &= ~__GFP_RECLAIM;
gfp_mask |= GFP_TRANSHUGE;
- order = folio_order(folio);
+ order = folio_order(src);
}
- zidx = zone_idx(folio_zone(folio));
+ zidx = zone_idx(folio_zone(src));
if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
gfp_mask |= __GFP_HIGHMEM;
- new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);
-
- return &new_folio->page;
+ return __folio_alloc(gfp_mask, order, nid, mtc->nmask);
}
#ifdef CONFIG_NUMA
@@ -2509,13 +2462,12 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
return false;
}
-static struct page *alloc_misplaced_dst_page(struct page *page,
+static struct folio *alloc_misplaced_dst_folio(struct folio *src,
unsigned long data)
{
int nid = (int) data;
- int order = compound_order(page);
+ int order = folio_order(src);
gfp_t gfp = __GFP_THISNODE;
- struct folio *new;
if (order > 0)
gfp |= GFP_TRANSHUGE_LIGHT;
@@ -2524,9 +2476,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
__GFP_NOWARN;
gfp &= ~__GFP_RECLAIM;
}
- new = __folio_alloc_node(gfp, order, nid);
-
- return &new->page;
+ return __folio_alloc_node(gfp, order, nid);
}
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
@@ -2604,7 +2554,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
goto out;
list_add(&page->lru, &migratepages);
- nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+ nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
NULL, node, MIGRATE_ASYNC,
MR_NUMA_MISPLACED, &nr_succeeded);
if (nr_remaining) {
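/*
 * migrate_pages() callbacks now allocate and free folios rather than pages
 * (new_folio_t / free_folio_t).  A minimal node-targeted allocation callback
 * in the new form, modelled on alloc_misplaced_dst_folio() above
 * (hypothetical name, simplified gfp handling):
 */
static struct folio *example_alloc_dst_folio(struct folio *src,
					     unsigned long private)
{
	int nid = (int)private;

	return __folio_alloc_node(GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
				  folio_order(src), nid);
}

/*
 * Used as: migrate_pages(&list, example_alloc_dst_folio, NULL, nid,
 *                        MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
 */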
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index d30c9de60b0d..8365158460ed 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -83,9 +83,6 @@ again:
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
split_huge_pmd(vma, pmdp, addr);
- if (pmd_trans_unstable(pmdp))
- return migrate_vma_collect_skip(start, end,
- walk);
} else {
int ret;
@@ -100,16 +97,12 @@ again:
if (ret)
return migrate_vma_collect_skip(start, end,
walk);
- if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, -1,
- walk);
}
}
- if (unlikely(pmd_bad(*pmdp)))
- return migrate_vma_collect_skip(start, end, walk);
-
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!ptep)
+ goto again;
arch_enter_lazy_mmu_mode();
for (; addr < end; addr += PAGE_SIZE, ptep++) {
@@ -118,7 +111,7 @@ again:
swp_entry_t entry;
pte_t pte;
- pte = *ptep;
+ pte = ptep_get(ptep);
if (pte_none(pte)) {
if (vma_is_anonymous(vma)) {
@@ -201,7 +194,7 @@ again:
bool anon_exclusive;
pte_t swp_pte;
- flush_cache_page(vma, addr, pte_pfn(*ptep));
+ flush_cache_page(vma, addr, pte_pfn(pte));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
pte = ptep_clear_flush(vma, addr, ptep);
@@ -383,7 +376,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
/* ZONE_DEVICE pages are not on LRU */
if (!is_zone_device_page(page)) {
if (!PageLRU(page) && allow_drain) {
- /* Drain CPU's pagevec */
+ /* Drain CPU's lru cache */
lru_add_drain_all();
allow_drain = false;
}
@@ -580,6 +573,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
+ pte_t orig_pte;
/* Only allow populating anonymous memory */
if (!vma_is_anonymous(vma))
@@ -595,27 +589,10 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pmdp = pmd_alloc(mm, pudp, addr);
if (!pmdp)
goto abort;
-
if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
goto abort;
-
- /*
- * Use pte_alloc() instead of pte_alloc_map(). We can't run
- * pte_offset_map() on pmds where a huge pmd might be created
- * from a different thread.
- *
- * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
- * parallel threads are excluded by other means.
- *
- * Here we only have mmap_read_lock(mm).
- */
if (pte_alloc(mm, pmdp))
goto abort;
-
- /* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(pmdp)))
- goto abort;
-
if (unlikely(anon_vma_prepare(vma)))
goto abort;
if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
@@ -650,17 +627,20 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
}
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!ptep)
+ goto abort;
+ orig_pte = ptep_get(ptep);
if (check_stable_address_space(mm))
goto unlock_abort;
- if (pte_present(*ptep)) {
- unsigned long pfn = pte_pfn(*ptep);
+ if (pte_present(orig_pte)) {
+ unsigned long pfn = pte_pfn(orig_pte);
if (!is_zero_pfn(pfn))
goto unlock_abort;
flush = true;
- } else if (!pte_none(*ptep))
+ } else if (!pte_none(orig_pte))
goto unlock_abort;
/*
@@ -677,7 +657,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
get_page(page);
if (flush) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
+ flush_cache_page(vma, addr, pte_pfn(orig_pte));
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, entry);
update_mmu_cache(vma, addr, ptep);
diff --git a/mm/mincore.c b/mm/mincore.c
index 2d5be013a25a..b7f7a516b26c 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -113,14 +113,13 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
goto out;
}
- if (pmd_trans_unstable(pmd)) {
- __mincore_unmapped_range(addr, end, vma, vec);
- goto out;
- }
-
ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!ptep) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (; addr != end; ptep++, addr += PAGE_SIZE) {
- pte_t pte = *ptep;
+ pte_t pte = ptep_get(ptep);
/* We need to do cache lookup too for pte markers */
if (pte_none_mostly(pte))
diff --git a/mm/mlock.c b/mm/mlock.c
index 40b43f8740df..d7db94519884 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -312,6 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *start_pte, *pte;
+ pte_t ptent;
struct folio *folio;
ptl = pmd_trans_huge_lock(pmd, vma);
@@ -329,10 +330,15 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
}
start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!start_pte) {
+ walk->action = ACTION_AGAIN;
+ return 0;
+ }
for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
- if (!pte_present(*pte))
+ ptent = ptep_get(pte);
+ if (!pte_present(ptent))
continue;
- folio = vm_normal_folio(vma, addr, *pte);
+ folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
if (folio_test_large(folio))
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 7f7f9c677854..a1963c3322af 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -259,6 +259,8 @@ static int __init cmdline_parse_core(char *p, unsigned long *core,
return 0;
}
+bool mirrored_kernelcore __initdata_memblock;
+
/*
* kernelcore=size sets the amount of memory for use for allocations that
* cannot be reclaimed or migrated.
@@ -644,10 +646,8 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
}
/* Returns true if the struct page for the pfn is initialised */
-static inline bool __meminit early_page_initialised(unsigned long pfn)
+static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
{
- int nid = early_pfn_to_nid(pfn);
-
if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
return false;
@@ -693,15 +693,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn, int nid)
{
pg_data_t *pgdat;
- int nid, zid;
+ int zid;
- if (early_page_initialised(pfn))
+ if (early_page_initialised(pfn, nid))
return;
- nid = early_pfn_to_nid(pfn);
pgdat = NODE_DATA(nid);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
@@ -715,7 +714,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
-static inline bool early_page_initialised(unsigned long pfn)
+static inline bool early_page_initialised(unsigned long pfn, int nid)
{
return true;
}
@@ -725,7 +724,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_reserved_page(unsigned long pfn)
+static inline void init_reserved_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
@@ -736,7 +735,8 @@ static inline void init_reserved_page(unsigned long pfn)
* marks the pages PageReserved. The remaining valid pages are later
* sent to the buddy page allocator.
*/
-void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
+void __meminit reserve_bootmem_region(phys_addr_t start,
+ phys_addr_t end, int nid)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
@@ -745,7 +745,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
if (pfn_valid(start_pfn)) {
struct page *page = pfn_to_page(start_pfn);
- init_reserved_page(start_pfn);
+ init_reserved_page(start_pfn, nid);
/* Avoid false-positive PageTail() */
INIT_LIST_HEAD(&page->lru);
@@ -1166,24 +1166,15 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
/* Return the number of page frames in holes in a zone on a node */
static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn)
+ unsigned long zone_start_pfn,
+ unsigned long zone_end_pfn)
{
- unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
- unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- unsigned long zone_start_pfn, zone_end_pfn;
unsigned long nr_absent;
- /* When hotadd a new node from cpu_up(), the node should be empty */
- if (!node_start_pfn && !node_end_pfn)
+ /* zone is empty, we don't have any absent pages */
+ if (zone_start_pfn == zone_end_pfn)
return 0;
- zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
- zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
-
- adjust_zone_range_for_zone_movable(nid, zone_type,
- node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
/*
@@ -1227,9 +1218,6 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- /* When hotadd a new node from cpu_up(), the node should be empty */
- if (!node_start_pfn && !node_end_pfn)
- return 0;
/* Get the start and end of the zone */
*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
@@ -1250,6 +1238,24 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
return *zone_end_pfn - *zone_start_pfn;
}
+static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
+ z->zone_start_pfn = 0;
+ z->spanned_pages = 0;
+ z->present_pages = 0;
+#if defined(CONFIG_MEMORY_HOTPLUG)
+ z->present_early_pages = 0;
+#endif
+ }
+
+ pgdat->node_spanned_pages = 0;
+ pgdat->node_present_pages = 0;
+ pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
+}
+
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
unsigned long node_end_pfn)
@@ -1261,7 +1267,7 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
struct zone *zone = pgdat->node_zones + i;
unsigned long zone_start_pfn, zone_end_pfn;
unsigned long spanned, absent;
- unsigned long size, real_size;
+ unsigned long real_size;
spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
@@ -1269,23 +1275,22 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
&zone_start_pfn,
&zone_end_pfn);
absent = zone_absent_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn);
+ zone_start_pfn,
+ zone_end_pfn);
- size = spanned;
- real_size = size - absent;
+ real_size = spanned - absent;
- if (size)
+ if (spanned)
zone->zone_start_pfn = zone_start_pfn;
else
zone->zone_start_pfn = 0;
- zone->spanned_pages = size;
+ zone->spanned_pages = spanned;
zone->present_pages = real_size;
#if defined(CONFIG_MEMORY_HOTPLUG)
zone->present_early_pages = real_size;
#endif
- totalpages += size;
+ totalpages += spanned;
realtotalpages += real_size;
}
@@ -1375,6 +1380,10 @@ static void __meminit zone_init_free_lists(struct zone *zone)
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ INIT_LIST_HEAD(&zone->unaccepted_pages);
+#endif
}
void __meminit init_currently_empty_zone(struct zone *zone,
@@ -1502,6 +1511,8 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
pgdat->kswapd_order = 0;
pgdat->kswapd_highest_zoneidx = 0;
pgdat->node_start_pfn = 0;
+ pgdat->node_present_pages = 0;
+
for_each_online_cpu(cpu) {
struct per_cpu_nodestat *p;
@@ -1509,8 +1520,17 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
memset(p, 0, sizeof(*p));
}
- for (z = 0; z < MAX_NR_ZONES; z++)
- zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+ /*
+ * When memory is hot-added, all the memory is in offline state. So
+ * clear all zones' present_pages and managed_pages because they will
+ * be updated in online_pages() and offline_pages().
+ */
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ zone->present_pages = 0;
+ zone_init_internals(zone, z, nid, 0);
+ }
}
#endif
@@ -1578,7 +1598,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
if (!size)
continue;
- set_pageblock_order();
setup_usemap(zone);
init_currently_empty_zone(zone, zone->zone_start_pfn, size);
}
@@ -1702,11 +1721,13 @@ static void __init free_area_init_node(int nid)
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
} else {
pr_info("Initmem setup node %d as memoryless\n", nid);
- }
- calculate_node_totalpages(pgdat, start_pfn, end_pfn);
+ reset_memoryless_node_totalpages(pgdat);
+ }
alloc_node_mem_map(pgdat);
pgdat_set_deferred_range(pgdat);
@@ -1716,7 +1737,7 @@ static void __init free_area_init_node(int nid)
}
/* Any regular or high memory on that node ? */
-static void check_for_memory(pg_data_t *pgdat, int nid)
+static void check_for_memory(pg_data_t *pgdat)
{
enum zone_type zone_type;
@@ -1724,9 +1745,9 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
struct zone *zone = &pgdat->node_zones[zone_type];
if (populated_zone(zone)) {
if (IS_ENABLED(CONFIG_HIGHMEM))
- node_set_state(nid, N_HIGH_MEMORY);
+ node_set_state(pgdat->node_id, N_HIGH_MEMORY);
if (zone_type <= ZONE_NORMAL)
- node_set_state(nid, N_NORMAL_MEMORY);
+ node_set_state(pgdat->node_id, N_NORMAL_MEMORY);
break;
}
}
@@ -1745,11 +1766,6 @@ void __init setup_nr_node_ids(void)
}
#endif
-static void __init free_area_init_memoryless_node(int nid)
-{
- free_area_init_node(nid);
-}
-
/*
* Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
* such cases we allow max_zone_pfn sorted in the descending order
@@ -1848,6 +1864,8 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
+ set_pageblock_order();
+
for_each_node(nid) {
pg_data_t *pgdat;
@@ -1860,7 +1878,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
panic("Cannot allocate %zuB for node %d.\n",
sizeof(*pgdat), nid);
arch_refresh_nodedata(nid, pgdat);
- free_area_init_memoryless_node(nid);
+ free_area_init_node(nid);
/*
* We do not want to confuse userspace by sysfs
@@ -1881,7 +1899,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Any memory on that node */
if (pgdat->node_present_pages)
node_set_state(nid, N_MEMORY);
- check_for_memory(pgdat, nid);
+ check_for_memory(pgdat);
}
memmap_init();
@@ -1960,6 +1978,9 @@ static void __init deferred_free_range(unsigned long pfn,
return;
}
+ /* Accept chunks smaller than MAX_ORDER upfront */
+ accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
+
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
@@ -2328,6 +2349,28 @@ void __init init_cma_reserved_pageblock(struct page *page)
}
#endif
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = pageblock_end_pfn(block_start_pfn);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ cond_resched();
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
void __init page_alloc_init_late(void)
{
struct zone *zone;
@@ -2368,6 +2411,8 @@ void __init page_alloc_init_late(void)
/* Initialize page ext after all struct pages are initialized. */
if (deferred_struct_pages)
page_ext_init();
+
+ page_alloc_sysctl_init();
}
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
@@ -2532,8 +2577,14 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
- if (!early_page_initialised(pfn))
- return;
+
+ if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
+ int nid = early_pfn_to_nid(pfn);
+
+ if (!early_page_initialised(pfn, nid))
+ return;
+ }
+
if (!kmsan_memblock_free_pages(page, order)) {
/* KMSAN will take care of these pages. */
return;
@@ -2541,6 +2592,12 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
__free_pages_core(page, order);
}
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
+EXPORT_SYMBOL(init_on_alloc);
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
+EXPORT_SYMBOL(init_on_free);
+
static bool _init_on_alloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
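For context on the new nid parameter: reserve_bootmem_region() now expects its caller to supply the node id it already knows, rather than deriving it per page via early_pfn_to_nid(). A minimal caller sketch, assuming memblock-style iteration (the function name is hypothetical; the actual caller-side adaptation is not shown in this hunk):

/* Sketch only: pass the node id memblock already tracks for each
 * reserved range straight through to reserve_bootmem_region().
 */
static void __init sketch_reserve_bootmem(void)
{
	struct memblock_region *region;

	for_each_reserved_mem_region(region) {
		int nid = memblock_get_region_node(region);

		reserve_bootmem_region(region->base,
				       region->base + region->size, nid);
	}
}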
diff --git a/mm/mmap.c b/mm/mmap.c
index 13678edaa22c..3e5793ebbaae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -182,7 +182,8 @@ static int check_brk_limits(unsigned long addr, unsigned long len)
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;
- return mlock_future_check(current->mm, current->mm->def_flags, len);
+ return mlock_future_ok(current->mm, current->mm->def_flags, len)
+ ? 0 : -EAGAIN;
}
static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
unsigned long addr, unsigned long request, unsigned long flags);
@@ -300,61 +301,40 @@ out:
}
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
-extern void mt_validate(struct maple_tree *mt);
-extern void mt_dump(const struct maple_tree *mt);
-
-/* Validate the maple tree */
-static void validate_mm_mt(struct mm_struct *mm)
-{
- struct maple_tree *mt = &mm->mm_mt;
- struct vm_area_struct *vma_mt;
-
- MA_STATE(mas, mt, 0, 0);
-
- mt_validate(&mm->mm_mt);
- mas_for_each(&mas, vma_mt, ULONG_MAX) {
- if ((vma_mt->vm_start != mas.index) ||
- (vma_mt->vm_end - 1 != mas.last)) {
- pr_emerg("issue in %s\n", current->comm);
- dump_stack();
- dump_vma(vma_mt);
- pr_emerg("mt piv: %p %lu - %lu\n", vma_mt,
- mas.index, mas.last);
- pr_emerg("mt vma: %p %lu - %lu\n", vma_mt,
- vma_mt->vm_start, vma_mt->vm_end);
-
- mt_dump(mas.tree);
- if (vma_mt->vm_end != mas.last + 1) {
- pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n",
- mm, vma_mt->vm_start, vma_mt->vm_end,
- mas.index, mas.last);
- mt_dump(mas.tree);
- }
- VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm);
- if (vma_mt->vm_start != mas.index) {
- pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n",
- mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end);
- mt_dump(mas.tree);
- }
- VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm);
- }
- }
-}
-
static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
struct vm_area_struct *vma;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
-
- validate_mm_mt(mm);
+ VMA_ITERATOR(vmi, mm, 0);
- mas_for_each(&mas, vma, ULONG_MAX) {
+ mt_validate(&mm->mm_mt);
+ for_each_vma(vmi, vma) {
#ifdef CONFIG_DEBUG_VM_RB
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
+#endif
+ unsigned long vmi_start, vmi_end;
+ bool warn = 0;
+
+ vmi_start = vma_iter_addr(&vmi);
+ vmi_end = vma_iter_end(&vmi);
+ if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
+ warn = 1;
+ if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
+ warn = 1;
+
+ if (warn) {
+ pr_emerg("issue in %s\n", current->comm);
+ dump_stack();
+ dump_vma(vma);
+ pr_emerg("tree range: %px start %lx end %lx\n", vma,
+ vmi_start, vmi_end - 1);
+ vma_iter_dump_tree(&vmi);
+ }
+
+#ifdef CONFIG_DEBUG_VM_RB
if (anon_vma) {
anon_vma_lock_read(anon_vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
@@ -365,14 +345,13 @@ static void validate_mm(struct mm_struct *mm)
i++;
}
if (i != mm->map_count) {
- pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i);
+ pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
bug = 1;
}
VM_BUG_ON_MM(bug, mm);
}
#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
-#define validate_mm_mt(root) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
@@ -1167,21 +1146,21 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-int mlock_future_check(struct mm_struct *mm, unsigned long flags,
- unsigned long len)
+bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+ unsigned long bytes)
{
- unsigned long locked, lock_limit;
+ unsigned long locked_pages, limit_pages;
- /* mlock MCL_FUTURE? */
- if (flags & VM_LOCKED) {
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
- return 0;
+ if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+ return true;
+
+ locked_pages = bytes >> PAGE_SHIFT;
+ locked_pages += mm->locked_vm;
+
+ limit_pages = rlimit(RLIMIT_MEMLOCK);
+ limit_pages >>= PAGE_SHIFT;
+
+ return locked_pages <= limit_pages;
}
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
@@ -1293,7 +1272,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- if (mlock_future_check(mm, vm_flags, len))
+ if (!mlock_future_ok(mm, vm_flags, len))
return -EAGAIN;
if (file) {
@@ -1475,6 +1454,48 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
+{
+ return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
+}
+
+static bool vma_is_shared_writable(struct vm_area_struct *vma)
+{
+ return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
+ (VM_WRITE | VM_SHARED);
+}
+
+static bool vma_fs_can_writeback(struct vm_area_struct *vma)
+{
+ /* No managed pages to writeback. */
+ if (vma->vm_flags & VM_PFNMAP)
+ return false;
+
+ return vma->vm_file && vma->vm_file->f_mapping &&
+ mapping_can_writeback(vma->vm_file->f_mapping);
+}
+
+/*
+ * Does this VMA require the underlying folios to have their dirty state
+ * tracked?
+ */
+bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
+{
+ /* Only shared, writable VMAs require dirty tracking. */
+ if (!vma_is_shared_writable(vma))
+ return false;
+
+ /* Does the filesystem need to be notified? */
+ if (vm_ops_needs_writenotify(vma->vm_ops))
+ return true;
+
+ /*
+ * Even if the filesystem doesn't indicate a need for writenotify, if it
+ * can write back, dirty tracking is still required.
+ */
+ return vma_fs_can_writeback(vma);
+}
+
/*
* Some shared mappings will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
@@ -1483,21 +1504,18 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
*/
int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
- vm_flags_t vm_flags = vma->vm_flags;
- const struct vm_operations_struct *vm_ops = vma->vm_ops;
-
/* If it was private or non-writable, the write bit is already clear */
- if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+ if (!vma_is_shared_writable(vma))
return 0;
/* The backer wishes to know when pages are first written to? */
- if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite))
+ if (vm_ops_needs_writenotify(vma->vm_ops))
return 1;
/* The open routine did something to the protections that pgprot_modify
* won't preserve? */
if (pgprot_val(vm_page_prot) !=
- pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
+ pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
return 0;
/*
@@ -1511,13 +1529,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
if (userfaultfd_wp(vma))
return 1;
- /* Specialty mapping? */
- if (vm_flags & VM_PFNMAP)
- return 0;
-
/* Can the mapping track the dirty pages? */
- return vma->vm_file && vma->vm_file->f_mapping &&
- mapping_can_writeback(vma->vm_file->f_mapping);
+ return vma_fs_can_writeback(vma);
}
/*
@@ -1911,7 +1924,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
return -ENOMEM;
/* mlock limit tests */
- if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT))
+ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
return -ENOMEM;
/* Check to ensure the stack will not grow into a hugetlb-only region */
@@ -1935,7 +1948,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
* PA-RISC uses this for its stack; IA64 for its Register Backing Store.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
-int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next;
@@ -2027,6 +2040,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
+ * mmap_lock held for writing.
*/
int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
@@ -2035,16 +2049,20 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *prev;
int error = 0;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ return -EFAULT;
+
address &= PAGE_MASK;
- if (address < mmap_min_addr)
+ if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
return -EPERM;
/* Enforce stack_guard_gap */
prev = mas_prev(&mas, 0);
/* Check that both stack segments have the same anon_vma? */
- if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
- vma_is_accessible(prev)) {
- if (address - prev->vm_end < stack_guard_gap)
+ if (prev) {
+ if (!(prev->vm_flags & VM_GROWSDOWN) &&
+ vma_is_accessible(prev) &&
+ (address - prev->vm_end < stack_guard_gap))
return -ENOMEM;
}
@@ -2124,13 +2142,12 @@ static int __init cmdline_parse_stack_guard_gap(char *p)
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
#ifdef CONFIG_STACK_GROWSUP
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
return expand_upwards(vma, address);
}
-struct vm_area_struct *
-find_extend_vma(struct mm_struct *mm, unsigned long addr)
+struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma, *prev;
@@ -2138,20 +2155,23 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
- if (!prev || expand_stack(prev, addr))
+ if (!prev)
+ return NULL;
+ if (expand_stack_locked(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
return prev;
}
#else
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
+ return -EINVAL;
return expand_downwards(vma, address);
}
-struct vm_area_struct *
-find_extend_vma(struct mm_struct *mm, unsigned long addr)
+struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma;
unsigned long start;
@@ -2162,10 +2182,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return NULL;
if (vma->vm_start <= addr)
return vma;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- return NULL;
start = vma->vm_start;
- if (expand_stack(vma, addr))
+ if (expand_stack_locked(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED)
populate_vma_page_range(vma, addr, start, NULL);
@@ -2173,7 +2191,91 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
}
#endif
-EXPORT_SYMBOL_GPL(find_extend_vma);
+/*
+ * IA64 has some horrid mapping rules: it can expand both up and down,
+ * but with various special rules.
+ *
+ * We'll get rid of this architecture eventually, so the ugliness is
+ * temporary.
+ */
+#ifdef CONFIG_IA64
+static inline bool vma_expand_ok(struct vm_area_struct *vma, unsigned long addr)
+{
+ return REGION_NUMBER(addr) == REGION_NUMBER(vma->vm_start) &&
+ REGION_OFFSET(addr) < RGN_MAP_LIMIT;
+}
+
+/*
+ * IA64 stacks grow down, but there's a special register backing store
+ * that can grow up. Only sequentially, though, so the new address must
+ * match vm_end.
+ */
+static inline int vma_expand_up(struct vm_area_struct *vma, unsigned long addr)
+{
+ if (!vma_expand_ok(vma, addr))
+ return -EFAULT;
+ if (vma->vm_end != (addr & PAGE_MASK))
+ return -EFAULT;
+ return expand_upwards(vma, addr);
+}
+
+static inline bool vma_expand_down(struct vm_area_struct *vma, unsigned long addr)
+{
+ if (!vma_expand_ok(vma, addr))
+ return -EFAULT;
+ return expand_downwards(vma, addr);
+}
+
+#elif defined(CONFIG_STACK_GROWSUP)
+
+#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
+#define vma_expand_down(vma, addr) (-EFAULT)
+
+#else
+
+#define vma_expand_up(vma,addr) (-EFAULT)
+#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
+
+#endif
+
+/*
+ * expand_stack(): legacy interface for page faulting. Don't use unless
+ * you have to.
+ *
+ * This is called with the mm locked for reading, drops the lock, takes
+ * the lock for writing, tries to look up a vma again, expands it if
+ * necessary, and downgrades the lock to reading again.
+ *
+ * If no vma is found or it can't be expanded, it returns NULL and has
+ * dropped the lock.
+ */
+struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma, *prev;
+
+ mmap_read_unlock(mm);
+ if (mmap_write_lock_killable(mm))
+ return NULL;
+
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && vma->vm_start <= addr)
+ goto success;
+
+ if (prev && !vma_expand_up(prev, addr)) {
+ vma = prev;
+ goto success;
+ }
+
+ if (vma && !vma_expand_down(vma, addr))
+ goto success;
+
+ mmap_write_unlock(mm);
+ return NULL;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+}
/*
* Ok - we have the memory areas we should free on a maple tree so release them,
@@ -2234,7 +2336,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct *new;
int err;
- validate_mm_mt(vma->vm_mm);
+ validate_mm(vma->vm_mm);
WARN_ON(vma->vm_start >= addr);
WARN_ON(vma->vm_end <= addr);
@@ -2292,7 +2394,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* Success. */
if (new_below)
vma_next(vmi);
- validate_mm_mt(vma->vm_mm);
+ validate_mm(vma->vm_mm);
return 0;
out_free_mpol:
@@ -2301,7 +2403,7 @@ out_free_vmi:
vma_iter_free(vmi);
out_free_vma:
vm_area_free(new);
- validate_mm_mt(vma->vm_mm);
+ validate_mm(vma->vm_mm);
return err;
}
@@ -2318,21 +2420,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
return __split_vma(vmi, vma, addr, new_below);
}
-static inline int munmap_sidetree(struct vm_area_struct *vma,
- struct ma_state *mas_detach)
-{
- vma_start_write(vma);
- mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
- if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
- return -ENOMEM;
-
- vma_mark_detached(vma, true);
- if (vma->vm_flags & VM_LOCKED)
- vma->vm_mm->locked_vm -= vma_pages(vma);
-
- return 0;
-}
-
/*
* do_vmi_align_munmap() - munmap the aligned region from @start to @end.
* @vmi: The vma iterator
@@ -2354,6 +2441,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct maple_tree mt_detach;
int count = 0;
int error = -ENOMEM;
+ unsigned long locked_vm = 0;
MA_STATE(mas_detach, &mt_detach, 0, 0);
mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
mt_set_external_lock(&mt_detach, &mm->mmap_lock);
@@ -2399,33 +2487,42 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (error)
goto end_split_failed;
}
- error = munmap_sidetree(next, &mas_detach);
+ vma_start_write(next);
+ mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1);
+ error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
if (error)
- goto munmap_sidetree_failed;
+ goto munmap_gather_failed;
+ vma_mark_detached(next, true);
+ if (next->vm_flags & VM_LOCKED)
+ locked_vm += vma_pages(next);
count++;
+ if (unlikely(uf)) {
+ /*
+ * If userfaultfd_unmap_prep returns an error, the vmas
+ * will remain split, but userland will get a
+ * highly unexpected error anyway. This is no
+ * different from the case where the first of the two
+ * __split_vma calls fails, but we don't undo the first
+ * split, even though we could. This failure is unlikely
+ * enough that it's not worth optimizing for.
+ */
+ error = userfaultfd_unmap_prep(next, start, end, uf);
+
+ if (error)
+ goto userfaultfd_error;
+ }
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
BUG_ON(next->vm_start < start);
BUG_ON(next->vm_start > end);
#endif
}
- next = vma_next(vmi);
- if (unlikely(uf)) {
- /*
- * If userfaultfd_unmap_prep returns an error the vmas
- * will remain split, but userland will get a
- * highly unexpected error anyway. This is no
- * different than the case where the first of the two
- * __split_vma fails, but we don't undo the first
- * split, despite we could. This is unlikely enough
- * failure that it's not worth optimizing it for.
- */
- error = userfaultfd_unmap_prep(mm, start, end, uf);
+ if (vma_iter_end(vmi) > end)
+ next = vma_iter_load(vmi);
- if (error)
- goto userfaultfd_error;
- }
+ if (!next)
+ next = vma_next(vmi);
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/* Make sure no VMAs are about to be lost. */
@@ -2446,11 +2543,13 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
BUG_ON(count != test_count);
}
#endif
- /* Point of no return */
vma_iter_set(vmi, start);
- if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL))
- return -ENOMEM;
+ error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
+ if (error)
+ goto clear_tree_failed;
+ /* Point of no return */
+ mm->locked_vm -= locked_vm;
mm->map_count -= count;
/*
* Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
@@ -2480,9 +2579,14 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
validate_mm(mm);
return downgrade ? 1 : 0;
+clear_tree_failed:
userfaultfd_error:
-munmap_sidetree_failed:
+munmap_gather_failed:
end_split_failed:
+ mas_set(&mas_detach, 0);
+ mas_for_each(&mas_detach, next, end)
+ vma_mark_detached(next, false);
+
__mt_destroy(&mt_detach);
start_split_failed:
map_count_exceeded:
@@ -2623,6 +2727,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
cannot_expand:
+ if (prev)
+ vma_iter_next_range(&vmi);
+
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
@@ -2936,7 +3043,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
arch_unmap(mm, start, end);
ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade);
- validate_mm_mt(mm);
+ validate_mm(mm);
return ret;
}
@@ -2958,7 +3065,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm = current->mm;
struct vma_prepare vp;
- validate_mm_mt(mm);
+ validate_mm(mm);
/*
* Check against address space limits by the changed size
* Note: This happens *after* clearing old mappings in some code paths.
@@ -3199,7 +3306,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
bool faulted_in_anon_vma = true;
VMA_ITERATOR(vmi, mm, addr);
- validate_mm_mt(mm);
+ validate_mm(mm);
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
@@ -3258,7 +3365,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
goto out_vma_link;
*need_rmap_locks = false;
}
- validate_mm_mt(mm);
+ validate_mm(mm);
return new_vma;
out_vma_link:
@@ -3274,7 +3381,7 @@ out_free_mempol:
out_free_vma:
vm_area_free(new_vma);
out:
- validate_mm_mt(mm);
+ validate_mm(mm);
return NULL;
}
@@ -3411,7 +3518,7 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
- validate_mm_mt(mm);
+ validate_mm(mm);
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
@@ -3434,12 +3541,12 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
- validate_mm_mt(mm);
+ validate_mm(mm);
return vma;
out:
vm_area_free(vma);
- validate_mm_mt(mm);
+ validate_mm(mm);
return ERR_PTR(ret);
}
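The reworked expand_stack() above changes the locking contract for fault paths: it is entered with mmap_lock held for reading, drops and retakes the lock internally, and either returns a usable VMA with the lock read-held again or returns NULL with the lock already dropped. A minimal caller sketch under that contract (the function name is hypothetical; the real callers are the fault-handling paths that previously relied on find_extend_vma()):

static vm_fault_t sketch_handle_stack_fault(struct mm_struct *mm,
					    unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr) {
		/* May drop, retake and downgrade mmap_lock; on failure
		 * it returns NULL with the lock already dropped. */
		vma = expand_stack(mm, addr);
		if (!vma)
			return VM_FAULT_SIGSEGV;
	}

	/* ... handle the fault under mmap_read_lock() ... */

	mmap_read_unlock(mm);
	return 0;
}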
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 92d3d3ca390a..6f658d483704 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -93,22 +93,9 @@ static long change_pte_range(struct mmu_gather *tlb,
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
tlb_change_page_size(tlb, PAGE_SIZE);
-
- /*
- * Can be called with only the mmap_lock for reading by
- * prot_numa so we must check the pmd isn't constantly
- * changing from under us from pmd_none to pmd_trans_huge
- * and/or the other way around.
- */
- if (pmd_trans_unstable(pmd))
- return 0;
-
- /*
- * The pmd points to a regular pte so the pmd can't change
- * from under us even if the mmap_lock is only hold for
- * reading.
- */
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (!pte)
+ return -EAGAIN;
/* Get target node for single threaded private VMAs */
if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
@@ -118,7 +105,7 @@ static long change_pte_range(struct mmu_gather *tlb,
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
- oldpte = *pte;
+ oldpte = ptep_get(pte);
if (pte_present(oldpte)) {
pte_t ptent;
@@ -302,31 +289,6 @@ static long change_pte_range(struct mmu_gather *tlb,
}
/*
- * Used when setting automatic NUMA hinting protection where it is
- * critical that a numa hinting PMD is not confused with a bad PMD.
- */
-static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
-{
- pmd_t pmdval = pmdp_get_lockless(pmd);
-
- /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- barrier();
-#endif
-
- if (pmd_none(pmdval))
- return 1;
- if (pmd_trans_huge(pmdval))
- return 0;
- if (unlikely(pmd_bad(pmdval))) {
- pmd_clear_bad(pmd);
- return 1;
- }
-
- return 0;
-}
-
-/*
* Return true if we want to split THPs into PTE mappings in change
* protection procedure, false otherwise.
*/
@@ -403,7 +365,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
long ret;
-
+ pmd_t _pmd;
+again:
next = pmd_addr_end(addr, end);
ret = change_pmd_prepare(vma, pmd, cp_flags);
@@ -411,16 +374,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
pages = ret;
break;
}
- /*
- * Automatic NUMA balancing walks the tables with mmap_lock
- * held for read. It's possible a parallel update to occur
- * between pmd_trans_huge() and a pmd_none_or_clear_bad()
- * check leading to a false positive and clearing.
- * Hence, it's necessary to atomically read the PMD value
- * for all the checks.
- */
- if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
- pmd_none_or_clear_bad_unless_trans_huge(pmd))
+
+ if (pmd_none(*pmd))
goto next;
/* invoke the mmu notifier if the pmd is populated */
@@ -431,7 +386,8 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
mmu_notifier_invalidate_range_start(&range);
}
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ _pmd = pmdp_get_lockless(pmd);
+ if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
pgtable_split_needed(vma, cp_flags)) {
__split_huge_pmd(vma, pmd, addr, false, NULL);
@@ -446,15 +402,10 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
break;
}
} else {
- /*
- * change_huge_pmd() does not defer TLB flushes,
- * so no need to propagate the tlb argument.
- */
- int nr_ptes = change_huge_pmd(tlb, vma, pmd,
+ ret = change_huge_pmd(tlb, vma, pmd,
addr, newprot, cp_flags);
-
- if (nr_ptes) {
- if (nr_ptes == HPAGE_PMD_NR) {
+ if (ret) {
+ if (ret == HPAGE_PMD_NR) {
pages += HPAGE_PMD_NR;
nr_huge_updates++;
}
@@ -465,8 +416,12 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
}
/* fall through, the trans huge pmd just split */
}
- pages += change_pte_range(tlb, vma, pmd, addr, next,
- newprot, cp_flags);
+
+ ret = change_pte_range(tlb, vma, pmd, addr, next, newprot,
+ cp_flags);
+ if (ret < 0)
+ goto again;
+ pages += ret;
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
@@ -589,7 +544,8 @@ long change_protection(struct mmu_gather *tlb,
static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
- return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
+ *(pgprot_t *)(walk->private)) ?
0 : -EACCES;
}
@@ -597,7 +553,8 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long next,
struct mm_walk *walk)
{
- return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
+ *(pgprot_t *)(walk->private)) ?
0 : -EACCES;
}
@@ -867,7 +824,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
}
tlb_finish_mmu(&tlb);
- if (!error && vma_iter_end(&vmi) < end)
+ if (!error && tmp < end)
error = -ENOMEM;
out:
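The change_pte_range()/change_pmd_range() rework above replaces the pmd_trans_unstable() checks with a simpler contract: pte_offset_map_lock() may now fail, the PTE-level walker reports that as a negative value, and the PMD-level caller retries the range. A minimal sketch of that shape, with hypothetical helper names (the real code also re-checks the PMD for none/huge/swap entries before retrying):

static long sketch_pte_walk(struct mm_struct *mm, pmd_t *pmd,
			    unsigned long addr, unsigned long end)
{
	spinlock_t *ptl;
	pte_t *pte;
	long pages = 0;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -EAGAIN;	/* PTE table changed or vanished under us */
	do {
		if (pte_present(ptep_get(pte)))
			pages++;	/* real work would go here */
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);
	return pages;
}

static long sketch_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
			    unsigned long addr, unsigned long end)
{
	long ret;

again:
	ret = sketch_pte_walk(mm, pmd, addr, end);
	if (ret < 0)
		goto again;	/* re-examine the PMD, then retry the range */
	return ret;
}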
diff --git a/mm/mremap.c b/mm/mremap.c
index b11ce6c92099..fe6b722ae633 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -133,7 +133,7 @@ static pte_t move_soft_dirty_pte(pte_t pte)
return pte;
}
-static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
+static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
@@ -143,6 +143,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
unsigned long len = old_end - old_addr;
+ int err = 0;
/*
* When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
@@ -170,8 +171,16 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* pte locks because exclusive mmap_lock prevents deadlock.
*/
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- new_pte = pte_offset_map(new_pmd, new_addr);
- new_ptl = pte_lockptr(mm, new_pmd);
+ if (!old_pte) {
+ err = -EAGAIN;
+ goto out;
+ }
+ new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
+ if (!new_pte) {
+ pte_unmap_unlock(old_pte, old_ptl);
+ err = -EAGAIN;
+ goto out;
+ }
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
flush_tlb_batched_pending(vma->vm_mm);
@@ -179,7 +188,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
- if (pte_none(*old_pte))
+ if (pte_none(ptep_get(old_pte)))
continue;
pte = ptep_get_and_clear(mm, old_addr, old_pte);
@@ -208,8 +217,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
+out:
if (need_rmap_locks)
drop_rmap_locks(vma);
+ return err;
}
#ifndef arch_supports_page_table_move
@@ -537,6 +548,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
+again:
if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
pmd_devmap(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE &&
@@ -544,8 +556,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_pmd, new_pmd, need_rmap_locks))
continue;
split_huge_pmd(vma, old_pmd, old_addr);
- if (pmd_trans_unstable(old_pmd))
- continue;
} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
extent == PMD_SIZE) {
/*
@@ -556,11 +566,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_pmd, new_pmd, true))
continue;
}
-
+ if (pmd_none(*old_pmd))
+ continue;
if (pte_alloc(new_vma->vm_mm, new_pmd))
break;
- move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
- new_pmd, new_addr, need_rmap_locks);
+ if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
+ new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
+ goto again;
}
mmu_notifier_invalidate_range_end(&range);
@@ -775,7 +787,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return ERR_PTR(-EFAULT);
- if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
+ if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
return ERR_PTR(-EAGAIN);
if (!may_expand_vm(mm, vma->vm_flags,
@@ -914,7 +926,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
* mapping address intact. A non-zero tag will cause the subsequent
* range checks to reject the address as invalid.
*
- * See Documentation/arm64/tagged-address-abi.rst for more information.
+ * See Documentation/arch/arm64/tagged-address-abi.rst for more
+ * information.
*/
addr = untagged_addr(addr);
diff --git a/mm/nommu.c b/mm/nommu.c
index f670d9979a26..37d0b03143f1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -631,23 +631,20 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL(find_vma);
/*
- * find a VMA
- * - we don't extend stack VMAs under NOMMU conditions
- */
-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
-{
- return find_vma(mm, addr);
-}
-
-/*
* expand a stack to a given address
* - not supported under NOMMU conditions
*/
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
{
return -ENOMEM;
}
+struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
+{
+ mmap_read_unlock(mm);
+ return NULL;
+}
+
/*
 * look up the first VMA that exactly matches addr
* - should be called with mm->mmap_lock at least held readlocked
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 044e1eed720e..612b5597d3af 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1130,12 +1130,10 @@ bool out_of_memory(struct oom_control *oc)
/*
* The OOM killer does not compensate for IO-less reclaim.
- * pagefault_out_of_memory lost its gfp context so we have to
- * make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
- * invoke the OOM killer even if it is a GFP_NOFS allocation.
+ * But mem_cgroup_oom() has to invoke the OOM killer even
+ * if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
+ if (!(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index db7943999007..1d17fb1ec863 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2597,7 +2597,7 @@ EXPORT_SYMBOL(noop_dirty_folio);
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold lock_page_memcg().
+ * Caller must hold folio_memcg_lock().
*
* NOTE: This relies on being atomic wrt interrupts.
*/
@@ -2631,7 +2631,7 @@ static void folio_account_dirtied(struct folio *folio,
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold lock_page_memcg().
+ * Caller must hold folio_memcg_lock().
*/
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{
@@ -2650,7 +2650,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
* If warn is true, then emit a warning if the folio is not uptodate and has
* not been truncated.
*
- * The caller must hold lock_page_memcg(). Most callers have the folio
+ * The caller must hold folio_memcg_lock(). Most callers have the folio
* locked. A few have the folio blocked from truncation through other
* means (eg zap_vma_pages() has it mapped and is holding the page table
* lock). This can also be called from mark_buffer_dirty(), which I
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 47421bedc12b..7d3460c7a480 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,21 +18,14 @@
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
#include <linux/interrupt.h>
-#include <linux/pagemap.h>
#include <linux/jiffies.h>
-#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/kmsan.h>
#include <linux/module.h>
#include <linux/suspend.h>
-#include <linux/pagevec.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
@@ -41,19 +34,8 @@
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
-#include <linux/vmalloc.h>
#include <linux/vmstat.h>
-#include <linux/mempolicy.h>
-#include <linux/memremap.h>
-#include <linux/stop_machine.h>
-#include <linux/random.h>
-#include <linux/sort.h>
-#include <linux/pfn.h>
-#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
-#include <linux/page-isolation.h>
-#include <linux/debugobjects.h>
-#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
@@ -61,26 +43,19 @@
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
-#include <linux/hugetlb.h>
-#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_table_check.h>
-#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
-#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
-#include <asm/sections.h>
-#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"
-#include "swap.h"
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
@@ -227,18 +202,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
};
EXPORT_SYMBOL(node_states);
-atomic_long_t _totalram_pages __read_mostly;
-EXPORT_SYMBOL(_totalram_pages);
-unsigned long totalreserve_pages __read_mostly;
-unsigned long totalcma_pages __read_mostly;
-
-int percpu_pagelist_high_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
-EXPORT_SYMBOL(init_on_alloc);
-
-DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
-EXPORT_SYMBOL(init_on_free);
/*
* A cached value of the page's pageblock's migratetype, used when the page is
@@ -258,44 +222,6 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype)
page->index = migratetype;
}
-#ifdef CONFIG_PM_SLEEP
-/*
- * The following functions are used by the suspend/hibernate code to temporarily
- * change gfp_allowed_mask in order to avoid using I/O during memory allocations
- * while devices are suspended. To avoid races with the suspend/hibernate code,
- * they should always be called with system_transition_mutex held
- * (gfp_allowed_mask also should only be modified with system_transition_mutex
- * held, unless the suspend/hibernate code is guaranteed not to run in parallel
- * with that modification).
- */
-
-static gfp_t saved_gfp_mask;
-
-void pm_restore_gfp_mask(void)
-{
- WARN_ON(!mutex_is_locked(&system_transition_mutex));
- if (saved_gfp_mask) {
- gfp_allowed_mask = saved_gfp_mask;
- saved_gfp_mask = 0;
- }
-}
-
-void pm_restrict_gfp_mask(void)
-{
- WARN_ON(!mutex_is_locked(&system_transition_mutex));
- WARN_ON(saved_gfp_mask);
- saved_gfp_mask = gfp_allowed_mask;
- gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
-}
-
-bool pm_suspended_storage(void)
-{
- if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
- return false;
- return true;
-}
-#endif /* CONFIG_PM_SLEEP */
-
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif
@@ -314,7 +240,7 @@ static void __free_pages_ok(struct page *page, unsigned int order,
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
*/
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
+static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
[ZONE_DMA] = 256,
#endif
@@ -358,7 +284,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
#endif
};
-compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
+static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
[NULL_COMPOUND_DTOR] = NULL,
[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
@@ -371,10 +297,8 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
-int watermark_boost_factor __read_mostly = 15000;
-int watermark_scale_factor = 10;
-
-bool mirrored_kernelcore __initdata_memblock;
+static int watermark_boost_factor __read_mostly = 15000;
+static int watermark_scale_factor = 10;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -387,6 +311,12 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
+static bool page_contains_unaccepted(struct page *page, unsigned int order);
+static void accept_page(struct page *page, unsigned int order);
+static bool try_to_accept_memory(struct zone *zone, unsigned int order);
+static inline bool has_unaccepted_memory(void);
+static bool __free_unaccepted(struct page *page);
+
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -550,13 +480,6 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
return ret;
}
-static int page_is_consistent(struct zone *zone, struct page *page)
-{
- if (zone != page_zone(page))
- return 0;
-
- return 1;
-}
/*
* Temporary debugging check for pages not lying within a given zone.
*/
@@ -564,7 +487,7 @@ static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return 1;
- if (!page_is_consistent(zone, page))
+ if (zone != page_zone(page))
return 1;
return 0;
@@ -704,75 +627,6 @@ void destroy_large_folio(struct folio *folio)
compound_page_dtors[dtor](&folio->page);
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
-unsigned int _debug_guardpage_minorder;
-
-bool _debug_pagealloc_enabled_early __read_mostly
- = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
-EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
-DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
-EXPORT_SYMBOL(_debug_pagealloc_enabled);
-
-DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
-
-static int __init early_debug_pagealloc(char *buf)
-{
- return kstrtobool(buf, &_debug_pagealloc_enabled_early);
-}
-early_param("debug_pagealloc", early_debug_pagealloc);
-
-static int __init debug_guardpage_minorder_setup(char *buf)
-{
- unsigned long res;
-
- if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
- pr_err("Bad debug_guardpage_minorder value\n");
- return 0;
- }
- _debug_guardpage_minorder = res;
- pr_info("Setting debug_guardpage_minorder to %lu\n", res);
- return 0;
-}
-early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
-
-static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
-{
- if (!debug_guardpage_enabled())
- return false;
-
- if (order >= debug_guardpage_minorder())
- return false;
-
- __SetPageGuard(page);
- INIT_LIST_HEAD(&page->buddy_list);
- set_page_private(page, order);
- /* Guard pages are not available for any usage */
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, -(1 << order), migratetype);
-
- return true;
-}
-
-static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
-{
- if (!debug_guardpage_enabled())
- return;
-
- __ClearPageGuard(page);
-
- set_page_private(page, 0);
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, (1 << order), migratetype);
-}
-#else
-static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) { return false; }
-static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
-#endif
-
static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
@@ -879,7 +733,7 @@ static inline struct page *get_page_from_free_area(struct free_area *area,
int migratetype)
{
return list_first_entry_or_null(&area->free_list[migratetype],
- struct page, lru);
+ struct page, buddy_list);
}
/*
@@ -1131,6 +985,11 @@ static inline bool free_page_is_bad(struct page *page)
return true;
}
+static inline bool is_check_pages_enabled(void)
+{
+ return static_branch_unlikely(&check_pages_enabled);
+}
+
static int free_tail_page_prepare(struct page *head_page, struct page *page)
{
struct folio *folio = (struct folio *)head_page;
@@ -1142,7 +1001,7 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
*/
BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
- if (!static_branch_unlikely(&check_pages_enabled)) {
+ if (!is_check_pages_enabled()) {
ret = 0;
goto out;
}
@@ -1481,6 +1340,13 @@ void __free_pages_core(struct page *page, unsigned int order)
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
+ if (page_contains_unaccepted(page, order)) {
+ if (order == MAX_ORDER && __free_unaccepted(page))
+ return;
+
+ accept_page(page, order);
+ }
+
/*
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
@@ -1521,7 +1387,7 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
/* end_pfn is one past the range we are checking */
end_pfn--;
- if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ if (!pfn_valid(end_pfn))
return NULL;
start_page = pfn_to_online_page(start_pfn);
@@ -1540,33 +1406,6 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
return start_page;
}
-void set_zone_contiguous(struct zone *zone)
-{
- unsigned long block_start_pfn = zone->zone_start_pfn;
- unsigned long block_end_pfn;
-
- block_end_pfn = pageblock_end_pfn(block_start_pfn);
- for (; block_start_pfn < zone_end_pfn(zone);
- block_start_pfn = block_end_pfn,
- block_end_pfn += pageblock_nr_pages) {
-
- block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
-
- if (!__pageblock_pfn_to_page(block_start_pfn,
- block_end_pfn, zone))
- return;
- cond_resched();
- }
-
- /* We confirm that there is no hole */
- zone->contiguous = true;
-}
-
-void clear_zone_contiguous(struct zone *zone)
-{
- zone->contiguous = false;
-}
-
/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
@@ -2501,61 +2340,6 @@ void drain_all_pages(struct zone *zone)
__drain_all_pages(zone, false);
}
-#ifdef CONFIG_HIBERNATION
-
-/*
- * Touch the watchdog for every WD_PAGE_COUNT pages.
- */
-#define WD_PAGE_COUNT (128*1024)
-
-void mark_free_pages(struct zone *zone)
-{
- unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
- unsigned long flags;
- unsigned int order, t;
- struct page *page;
-
- if (zone_is_empty(zone))
- return;
-
- spin_lock_irqsave(&zone->lock, flags);
-
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
-
- if (!--page_count) {
- touch_nmi_watchdog();
- page_count = WD_PAGE_COUNT;
- }
-
- if (page_zone(page) != zone)
- continue;
-
- if (!swsusp_page_is_forbidden(page))
- swsusp_unset_page_free(page);
- }
-
- for_each_migratetype_order(order, t) {
- list_for_each_entry(page,
- &zone->free_area[order].free_list[t], buddy_list) {
- unsigned long i;
-
- pfn = page_to_pfn(page);
- for (i = 0; i < (1UL << order); i++) {
- if (!--page_count) {
- touch_nmi_watchdog();
- page_count = WD_PAGE_COUNT;
- }
- swsusp_set_page_free(pfn_to_page(pfn + i));
- }
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
-}
-#endif /* CONFIG_PM */
-
static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
unsigned int order)
{
@@ -3052,7 +2836,8 @@ struct page *rmqueue(struct zone *preferred_zone,
out:
/* Separate test+clear to avoid unnecessary atomics */
- if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
+ if ((alloc_flags & ALLOC_KSWAPD) &&
+ unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
@@ -3061,80 +2846,6 @@ out:
return page;
}
-#ifdef CONFIG_FAIL_PAGE_ALLOC
-
-static struct {
- struct fault_attr attr;
-
- bool ignore_gfp_highmem;
- bool ignore_gfp_reclaim;
- u32 min_order;
-} fail_page_alloc = {
- .attr = FAULT_ATTR_INITIALIZER,
- .ignore_gfp_reclaim = true,
- .ignore_gfp_highmem = true,
- .min_order = 1,
-};
-
-static int __init setup_fail_page_alloc(char *str)
-{
- return setup_fault_attr(&fail_page_alloc.attr, str);
-}
-__setup("fail_page_alloc=", setup_fail_page_alloc);
-
-static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
-{
- int flags = 0;
-
- if (order < fail_page_alloc.min_order)
- return false;
- if (gfp_mask & __GFP_NOFAIL)
- return false;
- if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
- return false;
- if (fail_page_alloc.ignore_gfp_reclaim &&
- (gfp_mask & __GFP_DIRECT_RECLAIM))
- return false;
-
- /* See comment in __should_failslab() */
- if (gfp_mask & __GFP_NOWARN)
- flags |= FAULT_NOWARN;
-
- return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
-}
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-static int __init fail_page_alloc_debugfs(void)
-{
- umode_t mode = S_IFREG | 0600;
- struct dentry *dir;
-
- dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
- &fail_page_alloc.attr);
-
- debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_reclaim);
- debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem);
- debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
-
- return 0;
-}
-
-late_initcall(fail_page_alloc_debugfs);
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
-#else /* CONFIG_FAIL_PAGE_ALLOC */
-
-static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
-{
- return false;
-}
-
-#endif /* CONFIG_FAIL_PAGE_ALLOC */
-
noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
@@ -3159,6 +2870,9 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ unusable_free += zone_page_state(z, NR_UNACCEPTED);
+#endif
return unusable_free;
}
@@ -3458,6 +3172,11 @@ retry:
gfp_mask)) {
int ret;
+ if (has_unaccepted_memory()) {
+ if (try_to_accept_memory(zone, order))
+ goto try_this_zone;
+ }
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
@@ -3510,6 +3229,11 @@ try_this_zone:
return page;
} else {
+ if (has_unaccepted_memory()) {
+ if (try_to_accept_memory(zone, order))
+ goto try_this_zone;
+ }
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (deferred_pages_enabled()) {
@@ -3768,56 +3492,41 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
if (fatal_signal_pending(current))
return false;
- if (compaction_made_progress(compact_result))
- (*compaction_retries)++;
-
- /*
- * compaction considers all the zone as desperately out of memory
- * so it doesn't really make much sense to retry except when the
- * failure could be caused by insufficient priority
- */
- if (compaction_failed(compact_result))
- goto check_priority;
-
/*
- * compaction was skipped because there are not enough order-0 pages
- * to work with, so we retry only if it looks like reclaim can help.
+ * Compaction was skipped due to a lack of free order-0
+ * migration targets. Continue if reclaim can help.
*/
- if (compaction_needs_reclaim(compact_result)) {
+ if (compact_result == COMPACT_SKIPPED) {
ret = compaction_zonelist_suitable(ac, order, alloc_flags);
goto out;
}
/*
- * make sure the compaction wasn't deferred or didn't bail out early
- * due to locks contention before we declare that we should give up.
- * But the next retry should use a higher priority if allowed, so
- * we don't just keep bailing out endlessly.
+ * Compaction managed to coalesce some page blocks, but the
+ * allocation failed presumably due to a race. Retry some.
*/
- if (compaction_withdrawn(compact_result)) {
- goto check_priority;
- }
+ if (compact_result == COMPACT_SUCCESS) {
+ /*
+ * !costly requests are much more important than
+ * __GFP_RETRY_MAYFAIL costly ones because they are de
+ * facto nofail and invoke OOM killer to move on while
+ * costly can fail and users are ready to cope with
+ * that. 1/4 retries is rather arbitrary but we would
+ * need much more detailed feedback from compaction to
+ * make a better decision.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ max_retries /= 4;
- /*
- * !costly requests are much more important than __GFP_RETRY_MAYFAIL
- * costly ones because they are de facto nofail and invoke OOM
- * killer to move on while costly can fail and users are ready
- * to cope with that. 1/4 retries is rather arbitrary but we
- * would need much more detailed feedback from compaction to
- * make a better decision.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- max_retries /= 4;
- if (*compaction_retries <= max_retries) {
- ret = true;
- goto out;
+ if (++(*compaction_retries) <= max_retries) {
+ ret = true;
+ goto out;
+ }
}
/*
- * Make sure there are attempts at the highest priority if we exhausted
- * all retries or failed at the lower priorities.
+ * Compaction failed. Retry with increasing priority.
*/
-check_priority:
min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
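
To make the retry budget above concrete (an illustrative calculation, not taken from the patch; the constants are assumed to be the usual mm/page_alloc.c values): with MAX_COMPACT_RETRIES at 16 and PAGE_ALLOC_COSTLY_ORDER at 3, a costly-order request gets 16/4 = 4 compaction retries before the priority is raised, while smaller orders keep all 16.

/* Illustrative sketch of the effective retry budget, with assumed constants. */
#define MAX_COMPACT_RETRIES	16	/* assumed, as in mm/page_alloc.c */
#define PAGE_ALLOC_COSTLY_ORDER	3	/* assumed */

static int compact_retry_budget(unsigned int order)
{
	int max_retries = MAX_COMPACT_RETRIES;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		max_retries /= 4;	/* 16 -> 4 for costly orders */
	return max_retries;
}
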
@@ -5137,383 +4846,6 @@ unsigned long nr_free_buffer_pages(void)
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
-static inline void show_node(struct zone *zone)
-{
- if (IS_ENABLED(CONFIG_NUMA))
- printk("Node %d ", zone_to_nid(zone));
-}
-
-long si_mem_available(void)
-{
- long available;
- unsigned long pagecache;
- unsigned long wmark_low = 0;
- unsigned long pages[NR_LRU_LISTS];
- unsigned long reclaimable;
- struct zone *zone;
- int lru;
-
- for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
-
- for_each_zone(zone)
- wmark_low += low_wmark_pages(zone);
-
- /*
- * Estimate the amount of memory available for userspace allocations,
- * without causing swapping or OOM.
- */
- available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
-
- /*
- * Not all the page cache can be freed, otherwise the system will
- * start swapping or thrashing. Assume at least half of the page
- * cache, or the low watermark worth of cache, needs to stay.
- */
- pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
- pagecache -= min(pagecache / 2, wmark_low);
- available += pagecache;
-
- /*
- * Part of the reclaimable slab and other kernel memory consists of
- * items that are in use, and cannot be freed. Cap this estimate at the
- * low watermark.
- */
- reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
- available += reclaimable - min(reclaimable / 2, wmark_low);
-
- if (available < 0)
- available = 0;
- return available;
-}
-EXPORT_SYMBOL_GPL(si_mem_available);
-
-void si_meminfo(struct sysinfo *val)
-{
- val->totalram = totalram_pages();
- val->sharedram = global_node_page_state(NR_SHMEM);
- val->freeram = global_zone_page_state(NR_FREE_PAGES);
- val->bufferram = nr_blockdev_pages();
- val->totalhigh = totalhigh_pages();
- val->freehigh = nr_free_highpages();
- val->mem_unit = PAGE_SIZE;
-}
-
-EXPORT_SYMBOL(si_meminfo);
-
-#ifdef CONFIG_NUMA
-void si_meminfo_node(struct sysinfo *val, int nid)
-{
- int zone_type; /* needs to be signed */
- unsigned long managed_pages = 0;
- unsigned long managed_highpages = 0;
- unsigned long free_highpages = 0;
- pg_data_t *pgdat = NODE_DATA(nid);
-
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
- val->totalram = managed_pages;
- val->sharedram = node_page_state(pgdat, NR_SHMEM);
- val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
-#ifdef CONFIG_HIGHMEM
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
- struct zone *zone = &pgdat->node_zones[zone_type];
-
- if (is_highmem(zone)) {
- managed_highpages += zone_managed_pages(zone);
- free_highpages += zone_page_state(zone, NR_FREE_PAGES);
- }
- }
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#else
- val->totalhigh = managed_highpages;
- val->freehigh = free_highpages;
-#endif
- val->mem_unit = PAGE_SIZE;
-}
-#endif
-
-/*
- * Determine whether the node should be displayed or not, depending on whether
- * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
- */
-static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
-{
- if (!(flags & SHOW_MEM_FILTER_NODES))
- return false;
-
- /*
- * no node mask - aka implicit memory numa policy. Do not bother with
- * the synchronization - read_mems_allowed_begin - because we do not
- * have to be precise here.
- */
- if (!nodemask)
- nodemask = &cpuset_current_mems_allowed;
-
- return !node_isset(nid, *nodemask);
-}
-
-static void show_migration_types(unsigned char type)
-{
- static const char types[MIGRATE_TYPES] = {
- [MIGRATE_UNMOVABLE] = 'U',
- [MIGRATE_MOVABLE] = 'M',
- [MIGRATE_RECLAIMABLE] = 'E',
- [MIGRATE_HIGHATOMIC] = 'H',
-#ifdef CONFIG_CMA
- [MIGRATE_CMA] = 'C',
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- [MIGRATE_ISOLATE] = 'I',
-#endif
- };
- char tmp[MIGRATE_TYPES + 1];
- char *p = tmp;
- int i;
-
- for (i = 0; i < MIGRATE_TYPES; i++) {
- if (type & (1 << i))
- *p++ = types[i];
- }
-
- *p = '\0';
- printk(KERN_CONT "(%s) ", tmp);
-}
-
-static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
-{
- int zone_idx;
- for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
- if (zone_managed_pages(pgdat->node_zones + zone_idx))
- return true;
- return false;
-}
-
-/*
- * Show free area list (used inside shift_scroll-lock stuff)
- * We also calculate the percentage fragmentation. We do this by counting the
- * memory on each free list with the exception of the first item on the list.
- *
- * Bits in @filter:
- * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
- * cpuset.
- */
-void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
-{
- unsigned long free_pcp = 0;
- int cpu, nid;
- struct zone *zone;
- pg_data_t *pgdat;
-
- for_each_populated_zone(zone) {
- if (zone_idx(zone) > max_zone_idx)
- continue;
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
-
- for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
- }
-
- printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
- " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu dirty:%lu writeback:%lu\n"
- " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu\n"
- " sec_pagetables:%lu bounce:%lu\n"
- " kernel_misc_reclaimable:%lu\n"
- " free:%lu free_pcp:%lu free_cma:%lu\n",
- global_node_page_state(NR_ACTIVE_ANON),
- global_node_page_state(NR_INACTIVE_ANON),
- global_node_page_state(NR_ISOLATED_ANON),
- global_node_page_state(NR_ACTIVE_FILE),
- global_node_page_state(NR_INACTIVE_FILE),
- global_node_page_state(NR_ISOLATED_FILE),
- global_node_page_state(NR_UNEVICTABLE),
- global_node_page_state(NR_FILE_DIRTY),
- global_node_page_state(NR_WRITEBACK),
- global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
- global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
- global_node_page_state(NR_FILE_MAPPED),
- global_node_page_state(NR_SHMEM),
- global_node_page_state(NR_PAGETABLE),
- global_node_page_state(NR_SECONDARY_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
- global_zone_page_state(NR_FREE_PAGES),
- free_pcp,
- global_zone_page_state(NR_FREE_CMA_PAGES));
-
- for_each_online_pgdat(pgdat) {
- if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
- continue;
- if (!node_has_managed_zones(pgdat, max_zone_idx))
- continue;
-
- printk("Node %d"
- " active_anon:%lukB"
- " inactive_anon:%lukB"
- " active_file:%lukB"
- " inactive_file:%lukB"
- " unevictable:%lukB"
- " isolated(anon):%lukB"
- " isolated(file):%lukB"
- " mapped:%lukB"
- " dirty:%lukB"
- " writeback:%lukB"
- " shmem:%lukB"
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- " shmem_thp: %lukB"
- " shmem_pmdmapped: %lukB"
- " anon_thp: %lukB"
-#endif
- " writeback_tmp:%lukB"
- " kernel_stack:%lukB"
-#ifdef CONFIG_SHADOW_CALL_STACK
- " shadow_call_stack:%lukB"
-#endif
- " pagetables:%lukB"
- " sec_pagetables:%lukB"
- " all_unreclaimable? %s"
- "\n",
- pgdat->node_id,
- K(node_page_state(pgdat, NR_ACTIVE_ANON)),
- K(node_page_state(pgdat, NR_INACTIVE_ANON)),
- K(node_page_state(pgdat, NR_ACTIVE_FILE)),
- K(node_page_state(pgdat, NR_INACTIVE_FILE)),
- K(node_page_state(pgdat, NR_UNEVICTABLE)),
- K(node_page_state(pgdat, NR_ISOLATED_ANON)),
- K(node_page_state(pgdat, NR_ISOLATED_FILE)),
- K(node_page_state(pgdat, NR_FILE_MAPPED)),
- K(node_page_state(pgdat, NR_FILE_DIRTY)),
- K(node_page_state(pgdat, NR_WRITEBACK)),
- K(node_page_state(pgdat, NR_SHMEM)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- K(node_page_state(pgdat, NR_SHMEM_THPS)),
- K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
- K(node_page_state(pgdat, NR_ANON_THPS)),
-#endif
- K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
- node_page_state(pgdat, NR_KERNEL_STACK_KB),
-#ifdef CONFIG_SHADOW_CALL_STACK
- node_page_state(pgdat, NR_KERNEL_SCS_KB),
-#endif
- K(node_page_state(pgdat, NR_PAGETABLE)),
- K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
- pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
- "yes" : "no");
- }
-
- for_each_populated_zone(zone) {
- int i;
-
- if (zone_idx(zone) > max_zone_idx)
- continue;
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
-
- free_pcp = 0;
- for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
-
- show_node(zone);
- printk(KERN_CONT
- "%s"
- " free:%lukB"
- " boost:%lukB"
- " min:%lukB"
- " low:%lukB"
- " high:%lukB"
- " reserved_highatomic:%luKB"
- " active_anon:%lukB"
- " inactive_anon:%lukB"
- " active_file:%lukB"
- " inactive_file:%lukB"
- " unevictable:%lukB"
- " writepending:%lukB"
- " present:%lukB"
- " managed:%lukB"
- " mlocked:%lukB"
- " bounce:%lukB"
- " free_pcp:%lukB"
- " local_pcp:%ukB"
- " free_cma:%lukB"
- "\n",
- zone->name,
- K(zone_page_state(zone, NR_FREE_PAGES)),
- K(zone->watermark_boost),
- K(min_wmark_pages(zone)),
- K(low_wmark_pages(zone)),
- K(high_wmark_pages(zone)),
- K(zone->nr_reserved_highatomic),
- K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
- K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
- K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
- K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
- K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
- K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
- K(zone->present_pages),
- K(zone_managed_pages(zone)),
- K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_BOUNCE)),
- K(free_pcp),
- K(this_cpu_read(zone->per_cpu_pageset->count)),
- K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
- printk("lowmem_reserve[]:");
- for (i = 0; i < MAX_NR_ZONES; i++)
- printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
- printk(KERN_CONT "\n");
- }
-
- for_each_populated_zone(zone) {
- unsigned int order;
- unsigned long nr[MAX_ORDER + 1], flags, total = 0;
- unsigned char types[MAX_ORDER + 1];
-
- if (zone_idx(zone) > max_zone_idx)
- continue;
- if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
- continue;
- show_node(zone);
- printk(KERN_CONT "%s: ", zone->name);
-
- spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order <= MAX_ORDER; order++) {
- struct free_area *area = &zone->free_area[order];
- int type;
-
- nr[order] = area->nr_free;
- total += nr[order] << order;
-
- types[order] = 0;
- for (type = 0; type < MIGRATE_TYPES; type++) {
- if (!free_area_empty(area, type))
- types[order] |= 1 << type;
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- for (order = 0; order <= MAX_ORDER; order++) {
- printk(KERN_CONT "%lu*%lukB ",
- nr[order], K(1UL) << order);
- if (nr[order])
- show_migration_types(types[order]);
- }
- printk(KERN_CONT "= %lukB\n", K(total));
- }
-
- for_each_online_node(nid) {
- if (show_mem_node_skip(filter, nid, nodemask))
- continue;
- hugetlb_show_meminfo_node(nid);
- }
-
- printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
-
- show_swap_cache_info();
-}
-
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
@@ -5560,12 +4892,12 @@ static int __parse_numa_zonelist_order(char *s)
return 0;
}
-char numa_zonelist_order[] = "Node";
-
+static char numa_zonelist_order[] = "Node";
+#define NUMA_ZONELIST_ORDER_LEN 16
/*
* sysctl handler for numa_zonelist_order
*/
-int numa_zonelist_order_handler(struct ctl_table *table, int write,
+static int numa_zonelist_order_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
if (write)
@@ -5573,7 +4905,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
return proc_dostring(table, write, buffer, length, ppos);
}
-
static int node_load[MAX_NUMNODES];
/**
@@ -5976,6 +5307,7 @@ static int zone_batchsize(struct zone *zone)
#endif
}
+static int percpu_pagelist_high_fraction;
static int zone_highsize(struct zone *zone, int batch, int cpu_online)
{
#ifdef CONFIG_MMU
@@ -6505,7 +5837,7 @@ postcore_initcall(init_per_zone_wmark_min)
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
-int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
+static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6521,7 +5853,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6551,7 +5883,7 @@ static void setup_min_unmapped_ratio(void)
}
-int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
+static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6578,7 +5910,7 @@ static void setup_min_slab_ratio(void)
sysctl_min_slab_ratio) / 100;
}
-int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -6602,8 +5934,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
* minimum watermarks. The lowmem reserve ratio can only make sense
* if in function of the boot time zone sizes.
*/
-int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
+static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table,
+ int write, void *buffer, size_t *length, loff_t *ppos)
{
int i;
@@ -6623,7 +5955,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
* cpu. It is the fraction of total pages in each zone that a hot per cpu
* pagelist can have before it gets flushed back to buddy allocator.
*/
-int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
+static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
int write, void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
@@ -6656,9 +5988,83 @@ out:
return ret;
}
+static struct ctl_table page_alloc_sysctl_table[] = {
+ {
+ .procname = "min_free_kbytes",
+ .data = &min_free_kbytes,
+ .maxlen = sizeof(min_free_kbytes),
+ .mode = 0644,
+ .proc_handler = min_free_kbytes_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "watermark_boost_factor",
+ .data = &watermark_boost_factor,
+ .maxlen = sizeof(watermark_boost_factor),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "watermark_scale_factor",
+ .data = &watermark_scale_factor,
+ .maxlen = sizeof(watermark_scale_factor),
+ .mode = 0644,
+ .proc_handler = watermark_scale_factor_sysctl_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_THREE_THOUSAND,
+ },
+ {
+ .procname = "percpu_pagelist_high_fraction",
+ .data = &percpu_pagelist_high_fraction,
+ .maxlen = sizeof(percpu_pagelist_high_fraction),
+ .mode = 0644,
+ .proc_handler = percpu_pagelist_high_fraction_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "lowmem_reserve_ratio",
+ .data = &sysctl_lowmem_reserve_ratio,
+ .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
+ .mode = 0644,
+ .proc_handler = lowmem_reserve_ratio_sysctl_handler,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "numa_zonelist_order",
+ .data = &numa_zonelist_order,
+ .maxlen = NUMA_ZONELIST_ORDER_LEN,
+ .mode = 0644,
+ .proc_handler = numa_zonelist_order_handler,
+ },
+ {
+ .procname = "min_unmapped_ratio",
+ .data = &sysctl_min_unmapped_ratio,
+ .maxlen = sizeof(sysctl_min_unmapped_ratio),
+ .mode = 0644,
+ .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+ {
+ .procname = "min_slab_ratio",
+ .data = &sysctl_min_slab_ratio,
+ .maxlen = sizeof(sysctl_min_slab_ratio),
+ .mode = 0644,
+ .proc_handler = sysctl_min_slab_ratio_sysctl_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+#endif
+ {}
+};
+
+void __init page_alloc_sysctl_init(void)
+{
+ register_sysctl_init("vm", page_alloc_sysctl_table);
+}
+
#ifdef CONFIG_CONTIG_ALLOC
-#if defined(CONFIG_DYNAMIC_DEBUG) || \
- (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* Usage: See admin-guide/dynamic-debug-howto.rst */
static void alloc_contig_dump_pages(struct list_head *page_list)
{
@@ -6672,11 +6078,6 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
dump_page(page, "migration failure");
}
}
-#else
-static inline void alloc_contig_dump_pages(struct list_head *page_list)
-{
-}
-#endif
/* [start, end) must belong to a single zone. */
int __alloc_contig_migrate_range(struct compact_control *cc,
@@ -7215,3 +6616,150 @@ bool has_managed_dma(void)
return false;
}
#endif /* CONFIG_ZONE_DMA */
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+
+/* Counts number of zones with unaccepted pages. */
+static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
+
+static bool lazy_accept = true;
+
+static int __init accept_memory_parse(char *p)
+{
+ if (!strcmp(p, "lazy")) {
+ lazy_accept = true;
+ return 0;
+ } else if (!strcmp(p, "eager")) {
+ lazy_accept = false;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+early_param("accept_memory", accept_memory_parse);
+
+static bool page_contains_unaccepted(struct page *page, unsigned int order)
+{
+ phys_addr_t start = page_to_phys(page);
+ phys_addr_t end = start + (PAGE_SIZE << order);
+
+ return range_contains_unaccepted_memory(start, end);
+}
+
+static void accept_page(struct page *page, unsigned int order)
+{
+ phys_addr_t start = page_to_phys(page);
+
+ accept_memory(start, start + (PAGE_SIZE << order));
+}
+
+static bool try_to_accept_memory_one(struct zone *zone)
+{
+ unsigned long flags;
+ struct page *page;
+ bool last;
+
+ if (list_empty(&zone->unaccepted_pages))
+ return false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ page = list_first_entry_or_null(&zone->unaccepted_pages,
+ struct page, lru);
+ if (!page) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return false;
+ }
+
+ list_del(&page->lru);
+ last = list_empty(&zone->unaccepted_pages);
+
+ __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ accept_page(page, MAX_ORDER);
+
+ __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
+
+ if (last)
+ static_branch_dec(&zones_with_unaccepted_pages);
+
+ return true;
+}
+
+static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+{
+ long to_accept;
+ int ret = false;
+
+ /* How much to accept to get to high watermark? */
+ to_accept = high_wmark_pages(zone) -
+ (zone_page_state(zone, NR_FREE_PAGES) -
+ __zone_watermark_unusable_free(zone, order, 0));
+
+ /* Accept at least one page */
+ do {
+ if (!try_to_accept_memory_one(zone))
+ break;
+ ret = true;
+ to_accept -= MAX_ORDER_NR_PAGES;
+ } while (to_accept > 0);
+
+ return ret;
+}
+
+static inline bool has_unaccepted_memory(void)
+{
+ return static_branch_unlikely(&zones_with_unaccepted_pages);
+}
+
+static bool __free_unaccepted(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long flags;
+ bool first = false;
+
+ if (!lazy_accept)
+ return false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ first = list_empty(&zone->unaccepted_pages);
+ list_add_tail(&page->lru, &zone->unaccepted_pages);
+ __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ if (first)
+ static_branch_inc(&zones_with_unaccepted_pages);
+
+ return true;
+}
+
+#else
+
+static bool page_contains_unaccepted(struct page *page, unsigned int order)
+{
+ return false;
+}
+
+static void accept_page(struct page *page, unsigned int order)
+{
+}
+
+static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+{
+ return false;
+}
+
+static inline bool has_unaccepted_memory(void)
+{
+ return false;
+}
+
+static bool __free_unaccepted(struct page *page)
+{
+ BUILD_BUG();
+ return false;
+}
+
+#endif /* CONFIG_UNACCEPTED_MEMORY */
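
Unaccepted memory (as used by confidential-computing guests) is either accepted eagerly at boot (accept_memory=eager) or, by default, lazily as the allocator runs short; in the lazy case the loop above pulls MAX_ORDER-sized chunks off the zone's unaccepted list until free pages, minus the part the watermark check cannot use, reach the high watermark. A standalone sketch of that arithmetic with made-up numbers (illustrative only, not part of the patch):

#include <stdio.h>

int main(void)
{
	long high_wmark = 4096;	/* high_wmark_pages(zone), in pages */
	long nr_free   = 3000;	/* zone_page_state(zone, NR_FREE_PAGES) */
	long unusable  = 500;	/* __zone_watermark_unusable_free() */
	long chunk     = 1024;	/* MAX_ORDER_NR_PAGES on a 4K, MAX_ORDER=10 setup */
	long to_accept = high_wmark - (nr_free - unusable);
	int accepted = 0;

	do {			/* accept at least one chunk, as the patch does */
		accepted++;
		to_accept -= chunk;
	} while (to_accept > 0);

	printf("accepted %d chunks\n", accepted);	/* 2 for these numbers */
	return 0;
}
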
diff --git a/mm/page_io.c b/mm/page_io.c
index 87b682d18850..684cd3c7b59b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -338,7 +338,7 @@ static void swap_writepage_bdev_sync(struct page *page,
bio_init(&bio, sis->bdev, &bv, 1,
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
bio.bi_iter.bi_sector = swap_page_sector(page);
- bio_add_page(&bio, page, thp_size(page), 0);
+ __bio_add_page(&bio, page, thp_size(page), 0);
bio_associate_blkg_from_page(&bio, page);
count_swpout_vm_event(page);
@@ -360,7 +360,7 @@ static void swap_writepage_bdev_async(struct page *page,
GFP_NOIO);
bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_end_io = end_swap_bio_write;
- bio_add_page(bio, page, thp_size(page), 0);
+ __bio_add_page(bio, page, thp_size(page), 0);
bio_associate_blkg_from_page(bio, page);
count_swpout_vm_event(page);
@@ -468,7 +468,7 @@ static void swap_readpage_bdev_sync(struct page *page,
bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
bio.bi_iter.bi_sector = swap_page_sector(page);
- bio_add_page(&bio, page, thp_size(page), 0);
+ __bio_add_page(&bio, page, thp_size(page), 0);
/*
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
@@ -488,7 +488,7 @@ static void swap_readpage_bdev_async(struct page *page,
bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_end_io = end_swap_bio_read;
- bio_add_page(bio, page, thp_size(page), 0);
+ __bio_add_page(bio, page, thp_size(page), 0);
count_vm_event(PSWPIN);
submit_bio(bio);
}
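
The swap I/O hunks above replace bio_add_page() with __bio_add_page(): each bio here was just allocated or initialised with room for exactly one segment, so adding the page cannot fail and the unchecked variant avoids a pointless error path. A minimal in-kernel sketch of the same pattern (illustrative only; the helper and its caller are hypothetical):

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Illustrative: write one page at a known sector of @bdev. */
static void submit_one_page(struct block_device *bdev, struct page *page,
			    sector_t sector)
{
	struct bio *bio = bio_alloc(bdev, 1, REQ_OP_WRITE, GFP_KERNEL);

	bio->bi_iter.bi_sector = sector;
	/* The bio was allocated with one vec, so this cannot fail. */
	__bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(bio);
}
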
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c6f3605e37ab..6599cc965e21 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -481,10 +481,9 @@ failed:
}
/**
- * start_isolate_page_range() - make page-allocation-type of range of pages to
- * be MIGRATE_ISOLATE.
- * @start_pfn: The lower PFN of the range to be isolated.
- * @end_pfn: The upper PFN of the range to be isolated.
+ * start_isolate_page_range() - mark page range MIGRATE_ISOLATE
+ * @start_pfn: The first PFN of the range to be isolated.
+ * @end_pfn: The last PFN of the range to be isolated.
* @migratetype: Migrate type to set in error recovery.
* @flags: The following flags are allowed (they can be combined in
* a bit mask)
@@ -571,8 +570,14 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
return 0;
}
-/*
- * Make isolated pages available again.
+/**
+ * undo_isolate_page_range - undo effects of start_isolate_page_range()
+ * @start_pfn: The first PFN of the isolated range
+ * @end_pfn: The last PFN of the isolated range
+ * @migratetype: New migrate type to set on the range
+ *
+ * This finds every MIGRATE_ISOLATE page block in the given range
+ * and switches it to @migratetype.
*/
void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
int migratetype)
@@ -631,7 +636,21 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
return pfn;
}
-/* Caller should ensure that requested range is in a single zone */
+/**
+ * test_pages_isolated - check if pageblocks in range are isolated
+ * @start_pfn: The first PFN of the isolated range
+ * @end_pfn: The first PFN *after* the isolated range
+ * @isol_flags: Testing mode flags
+ *
+ * This tests if all pages in the specified range are free.
+ *
+ * If %MEMORY_OFFLINE is specified in @isol_flags, poisoned and
+ * offlined pages are considered free as well.
+ *
+ * Caller must ensure the requested range doesn't span zones.
+ *
+ * Returns 0 if true, -EBUSY if one or more pages are in use.
+ */
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int isol_flags)
{
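
The kernel-doc added above describes the usual isolate / test / undo sequence used by memory offlining and alloc_contig_range(). A simplified caller sketch (illustrative; assumes a suitably aligned PFN range within a single zone, and the helper name is hypothetical):

#include <linux/page-isolation.h>

/* Illustrative: isolate a PFN range, check whether it is free, release it. */
static int isolate_check_and_release(unsigned long start_pfn,
				     unsigned long end_pfn)
{
	int ret;

	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, MEMORY_OFFLINE);
	if (ret)
		return ret;	/* some pageblock could not be isolated */

	/* 0 if every page in the range is free (offlined/poisoned count too) */
	ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);

	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	return ret;
}
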
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 31169b3e7f06..c93baef0148f 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -418,7 +418,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfp_migratetype(page_owner->gfp_mask);
ret += scnprintf(kbuf + ret, count - ret,
- "PFN %lu type %s Block %lu type %s Flags %pGp\n",
+ "PFN 0x%lx type %s Block %lu type %s Flags %pGp\n",
pfn,
migratetype_names[page_mt],
pfn >> pageblock_order,
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 25d8610c0042..93ec7690a0d8 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -71,6 +71,8 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
page = pfn_to_page(pfn);
page_ext = page_ext_get(page);
+
+ BUG_ON(PageSlab(page));
anon = PageAnon(page);
for (i = 0; i < pgcnt; i++) {
@@ -107,6 +109,8 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
page = pfn_to_page(pfn);
page_ext = page_ext_get(page);
+
+ BUG_ON(PageSlab(page));
anon = PageAnon(page);
for (i = 0; i < pgcnt; i++) {
@@ -133,6 +137,8 @@ void __page_table_check_zero(struct page *page, unsigned int order)
struct page_ext *page_ext;
unsigned long i;
+ BUG_ON(PageSlab(page));
+
page_ext = page_ext_get(page);
BUG_ON(!page_ext);
for (i = 0; i < (1ul << order); i++) {
@@ -190,7 +196,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
if (&init_mm == mm)
return;
- __page_table_check_pte_clear(mm, addr, *ptep);
+ __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
if (pte_user_accessible_page(pte)) {
page_table_check_set(mm, addr, pte_pfn(pte),
PAGE_SIZE >> PAGE_SHIFT,
@@ -240,8 +246,10 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm,
pte_t *ptep = pte_offset_map(&pmd, addr);
unsigned long i;
+ if (WARN_ON(!ptep))
+ return;
for (i = 0; i < PTRS_PER_PTE; i++) {
- __page_table_check_pte_clear(mm, addr, *ptep);
+ __page_table_check_pte_clear(mm, addr, ptep_get(ptep));
addr += PAGE_SIZE;
ptep++;
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 4e448cfbc6ef..49e0d28f0379 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -13,42 +13,61 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
return false;
}
-static bool map_pte(struct page_vma_mapped_walk *pvmw)
+static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
{
- pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address);
- if (!(pvmw->flags & PVMW_SYNC)) {
- if (pvmw->flags & PVMW_MIGRATION) {
- if (!is_swap_pte(*pvmw->pte))
- return false;
- } else {
- /*
- * We get here when we are trying to unmap a private
- * device page from the process address space. Such
- * page is not CPU accessible and thus is mapped as
- * a special swap entry, nonetheless it still does
- * count as a valid regular mapping for the page (and
- * is accounted as such in page maps count).
- *
- * So handle this special case as if it was a normal
- * page mapping ie lock CPU page table and returns
- * true.
- *
- * For more details on device private memory see HMM
- * (include/linux/hmm.h or mm/hmm.c).
- */
- if (is_swap_pte(*pvmw->pte)) {
- swp_entry_t entry;
+ pte_t ptent;
- /* Handle un-addressable ZONE_DEVICE memory */
- entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_device_private_entry(entry) &&
- !is_device_exclusive_entry(entry))
- return false;
- } else if (!pte_present(*pvmw->pte))
- return false;
- }
+ if (pvmw->flags & PVMW_SYNC) {
+ /* Use the stricter lookup */
+ pvmw->pte = pte_offset_map_lock(pvmw->vma->vm_mm, pvmw->pmd,
+ pvmw->address, &pvmw->ptl);
+ *ptlp = pvmw->ptl;
+ return !!pvmw->pte;
}
- pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+
+ /*
+ * It is important to return the ptl corresponding to pte,
+ * in case *pvmw->pmd changes underneath us; so we need to
+ * return it even when choosing not to lock, in case caller
+ * proceeds to loop over next ptes, and finds a match later.
+ * Though, in most cases, page lock already protects this.
+ */
+ pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
+ pvmw->address, ptlp);
+ if (!pvmw->pte)
+ return false;
+
+ ptent = ptep_get(pvmw->pte);
+
+ if (pvmw->flags & PVMW_MIGRATION) {
+ if (!is_swap_pte(ptent))
+ return false;
+ } else if (is_swap_pte(ptent)) {
+ swp_entry_t entry;
+ /*
+ * Handle un-addressable ZONE_DEVICE memory.
+ *
+ * We get here when we are trying to unmap a private
+ * device page from the process address space. Such
+ * page is not CPU accessible and thus is mapped as
+ * a special swap entry, nonetheless it still does
+ * count as a valid regular mapping for the page
+ * (and is accounted as such in page maps count).
+ *
+ * So handle this special case as if it was a normal
+ * page mapping ie lock CPU page table and return true.
+ *
+ * For more details on device private memory see HMM
+ * (include/linux/hmm.h or mm/hmm.c).
+ */
+ entry = pte_to_swp_entry(ptent);
+ if (!is_device_private_entry(entry) &&
+ !is_device_exclusive_entry(entry))
+ return false;
+ } else if (!pte_present(ptent)) {
+ return false;
+ }
+ pvmw->ptl = *ptlp;
spin_lock(pvmw->ptl);
return true;
}
@@ -75,33 +94,34 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
static bool check_pte(struct page_vma_mapped_walk *pvmw)
{
unsigned long pfn;
+ pte_t ptent = ptep_get(pvmw->pte);
if (pvmw->flags & PVMW_MIGRATION) {
swp_entry_t entry;
- if (!is_swap_pte(*pvmw->pte))
+ if (!is_swap_pte(ptent))
return false;
- entry = pte_to_swp_entry(*pvmw->pte);
+ entry = pte_to_swp_entry(ptent);
if (!is_migration_entry(entry) &&
!is_device_exclusive_entry(entry))
return false;
pfn = swp_offset_pfn(entry);
- } else if (is_swap_pte(*pvmw->pte)) {
+ } else if (is_swap_pte(ptent)) {
swp_entry_t entry;
/* Handle un-addressable ZONE_DEVICE memory */
- entry = pte_to_swp_entry(*pvmw->pte);
+ entry = pte_to_swp_entry(ptent);
if (!is_device_private_entry(entry) &&
!is_device_exclusive_entry(entry))
return false;
pfn = swp_offset_pfn(entry);
} else {
- if (!pte_present(*pvmw->pte))
+ if (!pte_present(ptent))
return false;
- pfn = pte_pfn(*pvmw->pte);
+ pfn = pte_pfn(ptent);
}
return (pfn - pvmw->pfn) < pvmw->nr_pages;
@@ -153,6 +173,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
struct vm_area_struct *vma = pvmw->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long end;
+ spinlock_t *ptl;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -210,7 +231,7 @@ restart:
* compiler and used as a stale value after we've observed a
* subsequent update.
*/
- pmde = READ_ONCE(*pvmw->pmd);
+ pmde = pmdp_get_lockless(pvmw->pmd);
if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
(pmd_present(pmde) && pmd_devmap(pmde))) {
@@ -254,8 +275,11 @@ restart:
step_forward(pvmw, PMD_SIZE);
continue;
}
- if (!map_pte(pvmw))
+ if (!map_pte(pvmw, &ptl)) {
+ if (!pvmw->pte)
+ goto restart;
goto next_pte;
+ }
this_pte:
if (check_pte(pvmw))
return true;
@@ -275,14 +299,10 @@ next_pte:
goto restart;
}
pvmw->pte++;
- if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) {
- pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
- spin_lock(pvmw->ptl);
- }
- } while (pte_none(*pvmw->pte));
+ } while (pte_none(ptep_get(pvmw->pte)));
if (!pvmw->ptl) {
- pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+ pvmw->ptl = ptl;
spin_lock(pvmw->ptl);
}
goto this_pte;
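
map_pte() above now relies on pte_offset_map_nolock(): the pte is mapped and the spinlock that would guard it is reported back, but taking that lock is left to the caller, which matters because the walker often skips pte_none() entries before deciding to lock. A minimal sketch of the idiom outside the walker (illustrative; the helper is hypothetical, and a real caller must still re-validate the pte once the lock is held):

#include <linux/mm.h>

/* Illustrative: map a pte and lock it only if it looks present. */
static bool map_and_lock_if_present(struct mm_struct *mm, pmd_t *pmd,
				    unsigned long addr,
				    pte_t **ptep, spinlock_t **ptlp)
{
	pte_t *pte = pte_offset_map_nolock(mm, pmd, addr, ptlp);

	if (!pte)			/* pmd was none, huge, or changed */
		return false;
	if (!pte_present(ptep_get(pte))) {
		pte_unmap(pte);
		return false;
	}
	spin_lock(*ptlp);		/* the ptl reported by the map above */
	/* re-check ptep_get(pte) here before trusting it */
	*ptep = pte;
	return true;
}
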
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index cb23f8a15c13..64437105fe0d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -46,15 +46,27 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
spinlock_t *ptl;
if (walk->no_vma) {
- pte = pte_offset_map(pmd, addr);
- err = walk_pte_range_inner(pte, addr, end, walk);
- pte_unmap(pte);
+ /*
+ * pte_offset_map() might apply user-specific validation.
+ */
+ if (walk->mm == &init_mm)
+ pte = pte_offset_kernel(pmd, addr);
+ else
+ pte = pte_offset_map(pmd, addr);
+ if (pte) {
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ if (walk->mm != &init_mm)
+ pte_unmap(pte);
+ }
} else {
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- err = walk_pte_range_inner(pte, addr, end, walk);
- pte_unmap_unlock(pte, ptl);
+ if (pte) {
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap_unlock(pte, ptl);
+ }
}
-
+ if (!pte)
+ walk->action = ACTION_AGAIN;
return err;
}
@@ -141,11 +153,8 @@ again:
!(ops->pte_entry))
continue;
- if (walk->vma) {
+ if (walk->vma)
split_huge_pmd(walk->vma, pmd, addr);
- if (pmd_trans_unstable(pmd))
- goto again;
- }
if (is_hugepd(__hugepd(pmd_val(*pmd))))
err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
@@ -153,6 +162,10 @@ again:
err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
+
+ if (walk->action == ACTION_AGAIN)
+ goto again;
+
} while (pmd++, addr = next, addr != end);
return err;
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index f9847c131998..cdd0aa597a81 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -41,10 +41,17 @@ struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
int free_bytes; /* free bytes in the chunk */
struct pcpu_block_md chunk_md;
- void *base_addr; /* base address of this chunk */
+ unsigned long *bound_map; /* boundary map */
+
+ /*
+ * base_addr is the base address of this chunk.
+ * To reduce false sharing, the current layout keeps base_addr in a
+ * different cacheline from free_bytes and chunk_md.
+ */
+ void *base_addr ____cacheline_aligned_in_smp;
unsigned long *alloc_map; /* allocation map */
- unsigned long *bound_map; /* boundary map */
struct pcpu_block_md *md_blocks; /* metadata blocks */
void *data; /* chunk data */
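
The pcpu_chunk change above is a false-sharing fix: free_bytes and chunk_md are written on every allocation, while base_addr is only read, so pushing base_addr onto its own cacheline keeps the readers from being invalidated by the writers. The same annotation works for any hot structure; a generic sketch (illustrative, hypothetical struct):

#include <linux/cache.h>
#include <linux/spinlock.h>

/* Illustrative: keep the read-mostly pointer off the hot counters' line. */
struct example_pool {
	spinlock_t	lock;		/* taken on every alloc/free */
	int		nr_free;	/* written on every alloc/free */

	void		*base ____cacheline_aligned_in_smp; /* read-mostly */
	unsigned long	*bitmap;
};
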
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index d2fc52bffafc..4d454953046f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -10,6 +10,8 @@
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <asm/tlb.h>
@@ -66,7 +68,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
- int changed = !pte_same(*ptep, entry);
+ int changed = !pte_same(ptep_get(ptep), entry);
if (changed) {
set_pte_at(vma->vm_mm, address, ptep, entry);
flush_tlb_fix_spurious_fault(vma, address, ptep);
@@ -229,3 +231,57 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
+{
+ pmd_t pmdval;
+
+ /* rcu_read_lock() to be added later */
+ pmdval = pmdp_get_lockless(pmd);
+ if (pmdvalp)
+ *pmdvalp = pmdval;
+ if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
+ goto nomap;
+ if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
+ goto nomap;
+ if (unlikely(pmd_bad(pmdval))) {
+ pmd_clear_bad(pmd);
+ goto nomap;
+ }
+ return __pte_map(&pmdval, addr);
+nomap:
+ /* rcu_read_unlock() to be added later */
+ return NULL;
+}
+
+pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp)
+{
+ pmd_t pmdval;
+ pte_t *pte;
+
+ pte = __pte_offset_map(pmd, addr, &pmdval);
+ if (likely(pte))
+ *ptlp = pte_lockptr(mm, &pmdval);
+ return pte;
+}
+
+pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp)
+{
+ spinlock_t *ptl;
+ pmd_t pmdval;
+ pte_t *pte;
+again:
+ pte = __pte_offset_map(pmd, addr, &pmdval);
+ if (unlikely(!pte))
+ return pte;
+ ptl = pte_lockptr(mm, &pmdval);
+ spin_lock(ptl);
+ if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+ *ptlp = ptl;
+ return pte;
+ }
+ pte_unmap_unlock(pte, ptl);
+ goto again;
+}
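
__pte_offset_map_lock() above maps the pte, takes the pte lock, and re-reads the pmd under that lock, retrying if it changed; callers of pte_offset_map_lock() therefore only need to handle a NULL return (the pmd was none, huge, or otherwise not a page table). A minimal caller sketch (illustrative, hypothetical helper):

#include <linux/mm.h>

/* Illustrative: read the pte mapping @addr in @mm, or a zero pte if none. */
static pte_t read_pte_at(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t ptent = __pte(0);
	pte_t *pte = pte_offset_map_lock(mm, pmd, addr, &ptl);

	if (!pte)		/* no page table here (or it changed) */
		return ptent;
	ptent = ptep_get(pte);
	pte_unmap_unlock(pte, ptl);
	return ptent;
}
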
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 78dfaf9e8990..0523edab03a6 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -104,7 +104,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
mmap_read_lock(mm);
pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
flags, process_pages,
- NULL, &locked);
+ &locked);
if (locked)
mmap_read_unlock(mm);
if (pinned_pages <= 0)
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 8adab455a68b..03c1bdae4a43 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -119,7 +119,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- pte_t val = ptep_get(pte);
+ pte_t val = ptep_get_lockless(pte);
if (st->effective_prot)
st->effective_prot(st, 4, pte_val(val));
diff --git a/mm/readahead.c b/mm/readahead.c
index 47afbca1d122..a9c999aa19af 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -120,7 +120,6 @@
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/psi.h>
#include <linux/syscalls.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index 19392e090bec..0c0d8857dfce 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -826,7 +826,8 @@ static bool folio_referenced_one(struct folio *folio,
}
if (pvmw.pte) {
- if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
+ if (lru_gen_enabled() &&
+ pte_young(ptep_get(pvmw.pte))) {
lru_gen_look_around(&pvmw);
referenced++;
}
@@ -956,13 +957,13 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
address = pvmw->address;
if (pvmw->pte) {
- pte_t entry;
pte_t *pte = pvmw->pte;
+ pte_t entry = ptep_get(pte);
- if (!pte_dirty(*pte) && !pte_write(*pte))
+ if (!pte_dirty(entry) && !pte_write(entry))
continue;
- flush_cache_page(vma, address, pte_pfn(*pte));
+ flush_cache_page(vma, address, pte_pfn(entry));
entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
@@ -1137,7 +1138,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
* @folio: Folio which contains page.
* @page: Page to add to rmap.
* @vma: VM area to add page to.
- * @address: User virtual address of the mapping
+ * @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process
*/
static void __page_set_anon_rmap(struct folio *folio, struct page *page,
@@ -1458,6 +1459,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long pfn;
/*
* When racing against e.g. zap_pte_range() on another cpu,
@@ -1508,8 +1510,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
- subpage = folio_page(folio,
- pte_pfn(*pvmw.pte) - folio_pfn(folio));
+ pfn = pte_pfn(ptep_get(pvmw.pte));
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
@@ -1571,7 +1573,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ flush_cache_page(vma, address, pfn);
/* Nuke the page table entry. */
if (should_defer_flush(mm, flags)) {
/*
@@ -1818,6 +1820,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long pfn;
/*
* When racing against e.g. zap_pte_range() on another cpu,
@@ -1877,6 +1880,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
+ pfn = pte_pfn(ptep_get(pvmw.pte));
+
if (folio_is_zone_device(folio)) {
/*
* Our PTE is a non-present device exclusive entry and
@@ -1891,8 +1896,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
subpage = &folio->page;
} else {
- subpage = folio_page(folio,
- pte_pfn(*pvmw.pte) - folio_pfn(folio));
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
}
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
@@ -1952,7 +1956,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ flush_cache_page(vma, address, pfn);
/* Nuke the page table entry. */
if (should_defer_flush(mm, flags)) {
/*
@@ -2187,6 +2191,7 @@ static bool page_make_device_exclusive_one(struct folio *folio,
struct mmu_notifier_range range;
swp_entry_t entry;
pte_t swp_pte;
+ pte_t ptent;
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, address, min(vma->vm_end,
@@ -2198,18 +2203,19 @@ static bool page_make_device_exclusive_one(struct folio *folio,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- if (!pte_present(*pvmw.pte)) {
+ ptent = ptep_get(pvmw.pte);
+ if (!pte_present(ptent)) {
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
subpage = folio_page(folio,
- pte_pfn(*pvmw.pte) - folio_pfn(folio));
+ pte_pfn(ptent) - folio_pfn(folio));
address = pvmw.address;
/* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ flush_cache_page(vma, address, pte_pfn(ptent));
pteval = ptep_clear_flush(vma, address, pvmw.pte);
/* Set the dirty flag on the folio now the pte is gone. */
@@ -2328,7 +2334,7 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
npages = get_user_pages_remote(mm, start, npages,
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
- pages, NULL, NULL);
+ pages, NULL);
if (npages < 0)
return npages;
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 0b502625cd30..86442a15d12f 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -35,7 +35,7 @@
#define SECRETMEM_MODE_MASK (0x0)
#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK
-static bool secretmem_enable __ro_after_init;
+static bool secretmem_enable __ro_after_init = 1;
module_param_named(enable, secretmem_enable, bool, 0400);
MODULE_PARM_DESC(secretmem_enable,
"Enable secretmem and memfd_secret(2) system call");
@@ -125,7 +125,7 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
- if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+ if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
return -EAGAIN;
vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
diff --git a/mm/shmem.c b/mm/shmem.c
index e40a08c5c6d7..2f2e0e618072 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2731,6 +2731,138 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval ? retval : error;
}
+static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return true;
+}
+
+static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+}
+
+static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return false;
+}
+
+static const struct pipe_buf_operations zero_pipe_buf_ops = {
+ .release = zero_pipe_buf_release,
+ .try_steal = zero_pipe_buf_try_steal,
+ .get = zero_pipe_buf_get,
+};
+
+static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
+ loff_t fpos, size_t size)
+{
+ size_t offset = fpos & ~PAGE_MASK;
+
+ size = min_t(size_t, size, PAGE_SIZE - offset);
+
+ if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = pipe_head_buf(pipe);
+
+ *buf = (struct pipe_buffer) {
+ .ops = &zero_pipe_buf_ops,
+ .page = ZERO_PAGE(0),
+ .offset = offset,
+ .len = size,
+ };
+ pipe->head++;
+ }
+
+ return size;
+}
+
+static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode = file_inode(in);
+ struct address_space *mapping = inode->i_mapping;
+ struct folio *folio = NULL;
+ size_t total_spliced = 0, used, npages, n, part;
+ loff_t isize;
+ int error = 0;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ do {
+ if (*ppos >= i_size_read(inode))
+ break;
+
+ error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ);
+ if (error) {
+ if (error == -EINVAL)
+ error = 0;
+ break;
+ }
+ if (folio) {
+ folio_unlock(folio);
+
+ if (folio_test_hwpoison(folio)) {
+ error = -EIO;
+ break;
+ }
+ }
+
+ /*
+ * i_size must be checked after we know the pages are Uptodate.
+ *
+ * Checking i_size only after that allows us to calculate the
+ * correct value for "part", so that the zero-filled tail of the
+ * page is not copied back to userspace (unless another truncate
+ * extends the file - this is desired though).
+ */
+ isize = i_size_read(inode);
+ if (unlikely(*ppos >= isize))
+ break;
+ part = min_t(loff_t, isize - *ppos, len);
+
+ if (folio) {
+ /*
+ * If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+ folio_mark_accessed(folio);
+ /*
+ * Ok, we have the page, and it's up-to-date, so we can
+ * now splice it into the pipe.
+ */
+ n = splice_folio_into_pipe(pipe, folio, *ppos, part);
+ folio_put(folio);
+ folio = NULL;
+ } else {
+ n = splice_zeropage_into_pipe(pipe, *ppos, len);
+ }
+
+ if (!n)
+ break;
+ len -= n;
+ total_spliced += n;
+ *ppos += n;
+ in->f_ra.prev_pos = *ppos;
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ break;
+
+ cond_resched();
+ } while (len);
+
+ if (folio)
+ folio_put(folio);
+
+ file_accessed(in);
+ return total_spliced ? total_spliced : error;
+}
+
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
struct address_space *mapping = file->f_mapping;
@@ -3726,6 +3858,7 @@ out:
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
+ struct mempolicy *mpol;
if (sbinfo->max_blocks != shmem_default_max_blocks())
seq_printf(seq, ",size=%luk",
@@ -3768,7 +3901,9 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (sbinfo->huge)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif
- shmem_show_mpol(seq, sbinfo->mpol);
+ mpol = shmem_get_sbmpol(sbinfo);
+ shmem_show_mpol(seq, mpol);
+ mpol_put(mpol);
if (sbinfo->noswap)
seq_printf(seq, ",noswap");
return 0;
@@ -3971,7 +4106,7 @@ static const struct file_operations shmem_file_operations = {
.read_iter = shmem_file_read_iter,
.write_iter = generic_file_write_iter,
.fsync = noop_fsync,
- .splice_read = generic_file_splice_read,
+ .splice_read = shmem_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = shmem_fallocate,
#endif
@@ -4196,7 +4331,7 @@ static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
.init_fs_context = ramfs_init_fs_context,
.parameters = ramfs_fs_parameters,
- .kill_sb = kill_litter_super,
+ .kill_sb = ramfs_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
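
The new shmem_file_splice_read() is what backs splice() when the source is a tmpfs file; holes are fed into the pipe from the shared zero page without allocating folios. A small userspace sketch of the call it serves (illustrative; assumes /dev/shm is a tmpfs mount):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	int fd = open("/dev/shm/splice-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0 || pipe(pipefd) < 0)
		return 1;
	/* Sparse 1MB file: the hole past "hello" reads back as zeroes. */
	if (ftruncate(fd, 1 << 20) < 0 || pwrite(fd, "hello", 5, 0) != 5)
		return 1;

	/* Move up to 64K from the tmpfs file into the pipe without copying
	 * through a userspace buffer. */
	ssize_t n = splice(fd, NULL, pipefd[1], NULL, 65536, 0);

	printf("spliced %zd bytes\n", n);
	return n < 0;
}
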
diff --git a/mm/show_mem.c b/mm/show_mem.c
new file mode 100644
index 000000000000..01f8e9905817
--- /dev/null
+++ b/mm/show_mem.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic show_mem() implementation
+ *
+ * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/cma.h>
+#include <linux/cpuset.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+
+#include "internal.h"
+#include "swap.h"
+
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
+unsigned long totalreserve_pages __read_mostly;
+unsigned long totalcma_pages __read_mostly;
+
+static inline void show_node(struct zone *zone)
+{
+ if (IS_ENABLED(CONFIG_NUMA))
+ printk("Node %d ", zone_to_nid(zone));
+}
+
+long si_mem_available(void)
+{
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
+ unsigned long pages[NR_LRU_LISTS];
+ unsigned long reclaimable;
+ struct zone *zone;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
+
+ for_each_zone(zone)
+ wmark_low += low_wmark_pages(zone);
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping or OOM.
+ */
+ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping or thrashing. Assume at least half of the page
+ * cache, or the low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab and other kernel memory consists of
+ * items that are in use, and cannot be freed. Cap this estimate at the
+ * low watermark.
+ */
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ available += reclaimable - min(reclaimable / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+ return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
+void si_meminfo(struct sysinfo *val)
+{
+ val->totalram = totalram_pages();
+ val->sharedram = global_node_page_state(NR_SHMEM);
+ val->freeram = global_zone_page_state(NR_FREE_PAGES);
+ val->bufferram = nr_blockdev_pages();
+ val->totalhigh = totalhigh_pages();
+ val->freehigh = nr_free_highpages();
+ val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+ int zone_type; /* needs to be signed */
+ unsigned long managed_pages = 0;
+ unsigned long managed_highpages = 0;
+ unsigned long free_highpages = 0;
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
+ val->totalram = managed_pages;
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
+#ifdef CONFIG_HIGHMEM
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+
+ if (is_highmem(zone)) {
+ managed_highpages += zone_managed_pages(zone);
+ free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+ }
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
+#else
+ val->totalhigh = managed_highpages;
+ val->freehigh = free_highpages;
+#endif
+ val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+/*
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
+ */
+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
+{
+ if (!(flags & SHOW_MEM_FILTER_NODES))
+ return false;
+
+ /*
+ * no node mask - aka implicit memory numa policy. Do not bother with
+ * the synchronization - read_mems_allowed_begin - because we do not
+ * have to be precise here.
+ */
+ if (!nodemask)
+ nodemask = &cpuset_current_mems_allowed;
+
+ return !node_isset(nid, *nodemask);
+}
+
+static void show_migration_types(unsigned char type)
+{
+ static const char types[MIGRATE_TYPES] = {
+ [MIGRATE_UNMOVABLE] = 'U',
+ [MIGRATE_MOVABLE] = 'M',
+ [MIGRATE_RECLAIMABLE] = 'E',
+ [MIGRATE_HIGHATOMIC] = 'H',
+#ifdef CONFIG_CMA
+ [MIGRATE_CMA] = 'C',
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ [MIGRATE_ISOLATE] = 'I',
+#endif
+ };
+ char tmp[MIGRATE_TYPES + 1];
+ char *p = tmp;
+ int i;
+
+ for (i = 0; i < MIGRATE_TYPES; i++) {
+ if (type & (1 << i))
+ *p++ = types[i];
+ }
+
+ *p = '\0';
+ printk(KERN_CONT "(%s) ", tmp);
+}
+
+static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
+{
+ int zone_idx;
+ for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
+ if (zone_managed_pages(pgdat->node_zones + zone_idx))
+ return true;
+ return false;
+}
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ * cpuset.
+ */
+void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
+{
+ unsigned long free_pcp = 0;
+ int cpu, nid;
+ struct zone *zone;
+ pg_data_t *pgdat;
+
+ for_each_populated_zone(zone) {
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+ }
+
+ printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+ " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
+ " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu\n"
+ " sec_pagetables:%lu bounce:%lu\n"
+ " kernel_misc_reclaimable:%lu\n"
+ " free:%lu free_pcp:%lu free_cma:%lu\n",
+ global_node_page_state(NR_ACTIVE_ANON),
+ global_node_page_state(NR_INACTIVE_ANON),
+ global_node_page_state(NR_ISOLATED_ANON),
+ global_node_page_state(NR_ACTIVE_FILE),
+ global_node_page_state(NR_INACTIVE_FILE),
+ global_node_page_state(NR_ISOLATED_FILE),
+ global_node_page_state(NR_UNEVICTABLE),
+ global_node_page_state(NR_FILE_DIRTY),
+ global_node_page_state(NR_WRITEBACK),
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
+ global_node_page_state(NR_FILE_MAPPED),
+ global_node_page_state(NR_SHMEM),
+ global_node_page_state(NR_PAGETABLE),
+ global_node_page_state(NR_SECONDARY_PAGETABLE),
+ global_zone_page_state(NR_BOUNCE),
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
+ global_zone_page_state(NR_FREE_PAGES),
+ free_pcp,
+ global_zone_page_state(NR_FREE_CMA_PAGES));
+
+ for_each_online_pgdat(pgdat) {
+ if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
+ continue;
+ if (!node_has_managed_zones(pgdat, max_zone_idx))
+ continue;
+
+ printk("Node %d"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
+ " mapped:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ " shmem_thp: %lukB"
+ " shmem_pmdmapped: %lukB"
+ " anon_thp: %lukB"
+#endif
+ " writeback_tmp:%lukB"
+ " kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+ " shadow_call_stack:%lukB"
+#endif
+ " pagetables:%lukB"
+ " sec_pagetables:%lukB"
+ " all_unreclaimable? %s"
+ "\n",
+ pgdat->node_id,
+ K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_UNEVICTABLE)),
+ K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+ K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+ K(node_page_state(pgdat, NR_FILE_MAPPED)),
+ K(node_page_state(pgdat, NR_FILE_DIRTY)),
+ K(node_page_state(pgdat, NR_WRITEBACK)),
+ K(node_page_state(pgdat, NR_SHMEM)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(node_page_state(pgdat, NR_SHMEM_THPS)),
+ K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
+ K(node_page_state(pgdat, NR_ANON_THPS)),
+#endif
+ K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
+ K(node_page_state(pgdat, NR_PAGETABLE)),
+ K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+ "yes" : "no");
+ }
+
+ for_each_populated_zone(zone) {
+ int i;
+
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+
+ free_pcp = 0;
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+
+ show_node(zone);
+ printk(KERN_CONT
+ "%s"
+ " free:%lukB"
+ " boost:%lukB"
+ " min:%lukB"
+ " low:%lukB"
+ " high:%lukB"
+ " reserved_highatomic:%luKB"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " writepending:%lukB"
+ " present:%lukB"
+ " managed:%lukB"
+ " mlocked:%lukB"
+ " bounce:%lukB"
+ " free_pcp:%lukB"
+ " local_pcp:%ukB"
+ " free_cma:%lukB"
+ "\n",
+ zone->name,
+ K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(zone->watermark_boost),
+ K(min_wmark_pages(zone)),
+ K(low_wmark_pages(zone)),
+ K(high_wmark_pages(zone)),
+ K(zone->nr_reserved_highatomic),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
+ K(zone->present_pages),
+ K(zone_managed_pages(zone)),
+ K(zone_page_state(zone, NR_MLOCK)),
+ K(zone_page_state(zone, NR_BOUNCE)),
+ K(free_pcp),
+ K(this_cpu_read(zone->per_cpu_pageset->count)),
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
+ printk(KERN_CONT "\n");
+ }
+
+ for_each_populated_zone(zone) {
+ unsigned int order;
+ unsigned long nr[MAX_ORDER + 1], flags, total = 0;
+ unsigned char types[MAX_ORDER + 1];
+
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
+ if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
+ continue;
+ show_node(zone);
+ printk(KERN_CONT "%s: ", zone->name);
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order <= MAX_ORDER; order++) {
+ struct free_area *area = &zone->free_area[order];
+ int type;
+
+ nr[order] = area->nr_free;
+ total += nr[order] << order;
+
+ types[order] = 0;
+ for (type = 0; type < MIGRATE_TYPES; type++) {
+ if (!free_area_empty(area, type))
+ types[order] |= 1 << type;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ for (order = 0; order <= MAX_ORDER; order++) {
+ printk(KERN_CONT "%lu*%lukB ",
+ nr[order], K(1UL) << order);
+ if (nr[order])
+ show_migration_types(types[order]);
+ }
+ printk(KERN_CONT "= %lukB\n", K(total));
+ }
+
+ for_each_online_node(nid) {
+ if (show_mem_node_skip(filter, nid, nodemask))
+ continue;
+ hugetlb_show_meminfo_node(nid);
+ }
+
+ printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
+
+ show_swap_cache_info();
+}
+
+void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
+{
+ unsigned long total = 0, reserved = 0, highmem = 0;
+ struct zone *zone;
+
+ printk("Mem-Info:\n");
+ __show_free_areas(filter, nodemask, max_zone_idx);
+
+ for_each_populated_zone(zone) {
+
+ total += zone->present_pages;
+ reserved += zone->present_pages - zone_managed_pages(zone);
+
+ if (is_highmem(zone))
+ highmem += zone->present_pages;
+ }
+
+ printk("%lu pages RAM\n", total);
+ printk("%lu pages HighMem/MovableOnly\n", highmem);
+ printk("%lu pages reserved\n", reserved);
+#ifdef CONFIG_CMA
+ printk("%lu pages cma reserved\n", totalcma_pages);
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
+#endif
+}
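
All of the kilobyte figures in the report above come from the K() helper, which (in its usual definition) shifts a page count left by PAGE_SHIFT - 10, and __show_mem() reports reserved pages as present minus managed. A minimal user-space sketch of that conversion, assuming 4 KiB pages; the present/managed numbers below are made up for illustration:

#include <stdio.h>

#define PAGE_SHIFT 12                        /* assumed 4 KiB pages */
#define K(x) ((x) << (PAGE_SHIFT - 10))      /* pages -> kB, as used above */

int main(void)
{
        unsigned long present = 262144;      /* hypothetical zone->present_pages */
        unsigned long managed = 260000;      /* hypothetical managed page count */

        printf("present:%lukB managed:%lukB reserved:%lukB\n",
               K(present), K(managed), K(present - managed));
        return 0;
}
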
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index fe10436d9911..3ab53fad8876 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -5,12 +5,10 @@
#include <linux/seq_file.h>
#include <linux/shrinker.h>
#include <linux/memcontrol.h>
-#include <linux/srcu.h>
/* defined in vmscan.c */
-extern struct mutex shrinker_mutex;
+extern struct rw_semaphore shrinker_rwsem;
extern struct list_head shrinker_list;
-extern struct srcu_struct shrinker_srcu;
static DEFINE_IDA(shrinker_debugfs_ida);
static struct dentry *shrinker_debugfs_root;
@@ -51,13 +49,18 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
struct mem_cgroup *memcg;
unsigned long total;
bool memcg_aware;
- int ret = 0, nid, srcu_idx;
+ int ret, nid;
count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
if (!count_per_node)
return -ENOMEM;
- srcu_idx = srcu_read_lock(&shrinker_srcu);
+ ret = down_read_killable(&shrinker_rwsem);
+ if (ret) {
+ kfree(count_per_node);
+ return ret;
+ }
+ rcu_read_lock();
memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
@@ -88,7 +91,8 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
}
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ rcu_read_unlock();
+ up_read(&shrinker_rwsem);
kfree(count_per_node);
return ret;
@@ -111,8 +115,9 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
.gfp_mask = GFP_KERNEL,
};
struct mem_cgroup *memcg = NULL;
- int nid, srcu_idx;
+ int nid;
char kbuf[72];
+ ssize_t ret;
read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1);
if (copy_from_user(kbuf, buf, read_len))
@@ -141,7 +146,11 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
return -EINVAL;
}
- srcu_idx = srcu_read_lock(&shrinker_srcu);
+ ret = down_read_killable(&shrinker_rwsem);
+ if (ret) {
+ mem_cgroup_put(memcg);
+ return ret;
+ }
sc.nid = nid;
sc.memcg = memcg;
@@ -150,7 +159,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
shrinker->scan_objects(shrinker, &sc);
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ up_read(&shrinker_rwsem);
mem_cgroup_put(memcg);
return size;
@@ -168,7 +177,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
char buf[128];
int id;
- lockdep_assert_held(&shrinker_mutex);
+ lockdep_assert_held(&shrinker_rwsem);
/* debugfs isn't initialized yet, add debugfs entries later. */
if (!shrinker_debugfs_root)
@@ -211,7 +220,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
if (!new)
return -ENOMEM;
- mutex_lock(&shrinker_mutex);
+ down_write(&shrinker_rwsem);
old = shrinker->name;
shrinker->name = new;
@@ -229,7 +238,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
shrinker->debugfs_entry = entry;
}
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
kfree_const(old);
@@ -242,7 +251,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
{
struct dentry *entry = shrinker->debugfs_entry;
- lockdep_assert_held(&shrinker_mutex);
+ lockdep_assert_held(&shrinker_rwsem);
kfree_const(shrinker->name);
shrinker->name = NULL;
@@ -271,14 +280,14 @@ static int __init shrinker_debugfs_init(void)
shrinker_debugfs_root = dentry;
/* Create debugfs entries for shrinkers registered at boot */
- mutex_lock(&shrinker_mutex);
+ down_write(&shrinker_rwsem);
list_for_each_entry(shrinker, &shrinker_list, list)
if (!shrinker->debugfs_entry) {
ret = shrinker_debugfs_add(shrinker);
if (ret)
break;
}
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
return ret;
}
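
The shrinker_debug.c hunks above drop the SRCU read section in favour of shrinker_rwsem, taken with down_read_killable(), and free the per-node buffer if that acquisition fails. A rough user-space analogue of that acquire-or-bail pattern; the pthread primitives, count_show() and shrinker_lock are stand-ins, not the kernel API:

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

static pthread_rwlock_t shrinker_lock = PTHREAD_RWLOCK_INITIALIZER;

/* stand-in for shrinker_debugfs_count_show(): allocate, then lock or bail out */
int count_show(unsigned long **out, size_t nr_nodes)
{
        unsigned long *count_per_node = calloc(nr_nodes, sizeof(*count_per_node));
        int ret;

        if (!count_per_node)
                return -ENOMEM;

        ret = pthread_rwlock_rdlock(&shrinker_lock);   /* like down_read_killable() */
        if (ret) {
                free(count_per_node);                  /* undo the allocation on failure */
                return -ret;
        }

        /* ... walk the shrinker state under the read lock ... */

        pthread_rwlock_unlock(&shrinker_lock);
        *out = count_per_node;
        return 0;
}
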
diff --git a/mm/slab.c b/mm/slab.c
index bb57f7fdbae1..88194391d553 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1240,11 +1240,7 @@ void __init kmem_cache_init(void)
* Initialize the caches that provide memory for the kmem_cache_node
* structures first. Without this, further allocations will bug.
*/
- kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
- kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL],
- kmalloc_info[INDEX_NODE].size,
- ARCH_KMALLOC_FLAGS, 0,
- kmalloc_info[INDEX_NODE].size);
+ new_kmalloc_cache(INDEX_NODE, KMALLOC_NORMAL, ARCH_KMALLOC_FLAGS);
slab_state = PARTIAL_NODE;
setup_kmalloc_cache_index_table();
@@ -1887,14 +1883,12 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
return true;
}
-/**
+/*
* __kmem_cache_create - Create a cache.
* @cachep: cache management descriptor
* @flags: SLAB flags
*
- * Returns a ptr to the cache on success, NULL on failure.
- * Cannot be called within an int, but can be interrupted.
- * The @ctor is run when new pages are allocated by the cache.
+ * Returns zero on success, nonzero on failure.
*
* The flags are
*
@@ -1907,8 +1901,6 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
- *
- * Return: a pointer to the created cache or %NULL in case of error
*/
int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
{
@@ -2359,44 +2351,34 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab)
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Hold information during a freelist initialization */
-union freelist_init_state {
- struct {
- unsigned int pos;
- unsigned int *list;
- unsigned int count;
- };
- struct rnd_state rnd_state;
+struct freelist_init_state {
+ unsigned int pos;
+ unsigned int *list;
+ unsigned int count;
};
/*
* Initialize the state based on the randomization method available.
* return true if the pre-computed list is available, false otherwise.
*/
-static bool freelist_state_initialize(union freelist_init_state *state,
+static bool freelist_state_initialize(struct freelist_init_state *state,
struct kmem_cache *cachep,
unsigned int count)
{
bool ret;
- unsigned int rand;
-
- /* Use best entropy available to define a random shift */
- rand = get_random_u32();
-
- /* Use a random state if the pre-computed list is not available */
if (!cachep->random_seq) {
- prandom_seed_state(&state->rnd_state, rand);
ret = false;
} else {
state->list = cachep->random_seq;
state->count = count;
- state->pos = rand % count;
+ state->pos = get_random_u32_below(count);
ret = true;
}
return ret;
}
/* Get the next entry on the list and randomize it using a random shift */
-static freelist_idx_t next_random_slot(union freelist_init_state *state)
+static freelist_idx_t next_random_slot(struct freelist_init_state *state)
{
if (state->pos >= state->count)
state->pos = 0;
@@ -2417,7 +2399,7 @@ static void swap_free_obj(struct slab *slab, unsigned int a, unsigned int b)
static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab)
{
unsigned int objfreelist = 0, i, rand, count = cachep->num;
- union freelist_init_state state;
+ struct freelist_init_state state;
bool precomputed;
if (count < 2)
@@ -2446,8 +2428,7 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab)
/* Fisher-Yates shuffle */
for (i = count - 1; i > 0; i--) {
- rand = prandom_u32_state(&state.rnd_state);
- rand %= (i + 1);
+ rand = get_random_u32_below(i + 1);
swap_free_obj(slab, i, rand);
}
} else {
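
Both SLAB hunks replace the private prandom state with get_random_u32_below() while keeping the same Fisher-Yates shuffle of the freelist. A self-contained user-space sketch of that shuffle; random_u32_below() here is a simplified stand-in and does not correct for modulo bias the way the kernel helper does:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* simplified stand-in for get_random_u32_below(); ignores modulo bias */
static unsigned int random_u32_below(unsigned int ceil)
{
        return (unsigned int)rand() % ceil;
}

/* the same Fisher-Yates walk the freelist shuffling performs */
static void shuffle(unsigned int *list, unsigned int count)
{
        for (unsigned int i = count - 1; i > 0; i--) {
                unsigned int j = random_u32_below(i + 1);
                unsigned int tmp = list[i];

                list[i] = list[j];
                list[j] = tmp;
        }
}

int main(void)
{
        unsigned int freelist[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

        srand((unsigned int)time(NULL));
        shuffle(freelist, 8);
        for (int i = 0; i < 8; i++)
                printf("%u ", freelist[i]);
        printf("\n");
        return 0;
}
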
diff --git a/mm/slab.h b/mm/slab.h
index f01ac256a8f5..6a5633b25eb5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -6,6 +6,38 @@
*/
void __init kmem_cache_init(void);
+#ifdef CONFIG_64BIT
+# ifdef system_has_cmpxchg128
+# define system_has_freelist_aba() system_has_cmpxchg128()
+# define try_cmpxchg_freelist try_cmpxchg128
+# endif
+#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128
+typedef u128 freelist_full_t;
+#else /* CONFIG_64BIT */
+# ifdef system_has_cmpxchg64
+# define system_has_freelist_aba() system_has_cmpxchg64()
+# define try_cmpxchg_freelist try_cmpxchg64
+# endif
+#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64
+typedef u64 freelist_full_t;
+#endif /* CONFIG_64BIT */
+
+#if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+#undef system_has_freelist_aba
+#endif
+
+/*
+ * Freelist pointer and counter to cmpxchg together; this avoids the typical
+ * ABA problems of cmpxchg on just the pointer.
+ */
+typedef union {
+ struct {
+ void *freelist;
+ unsigned long counter;
+ };
+ freelist_full_t full;
+} freelist_aba_t;
+
/* Reuses the bits in struct page */
struct slab {
unsigned long __page_flags;
@@ -38,14 +70,21 @@ struct slab {
#endif
};
/* Double-word boundary */
- void *freelist; /* first free object */
union {
- unsigned long counters;
struct {
- unsigned inuse:16;
- unsigned objects:15;
- unsigned frozen:1;
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
};
+#ifdef system_has_freelist_aba
+ freelist_aba_t freelist_counter;
+#endif
};
};
struct rcu_head rcu_head;
@@ -72,8 +111,8 @@ SLAB_MATCH(memcg_data, memcg_data);
#endif
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB)
-static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 2*sizeof(void *)));
+#if defined(system_has_freelist_aba) && defined(CONFIG_SLUB)
+static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)));
#endif
/**
@@ -255,9 +294,8 @@ gfp_t kmalloc_fix_flags(gfp_t flags);
/* Functions provided by the slab allocators */
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
-struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size,
- slab_flags_t flags, unsigned int useroffset,
- unsigned int usersize);
+void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type,
+ slab_flags_t flags);
extern void create_boot_cache(struct kmem_cache *, const char *name,
unsigned int size, slab_flags_t flags,
unsigned int useroffset, unsigned int usersize);
@@ -294,11 +332,11 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
#if defined(CONFIG_SLAB)
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
- SLAB_ACCOUNT)
+ SLAB_ACCOUNT | SLAB_NO_MERGE)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | SLAB_ACCOUNT | \
- SLAB_NO_USER_FLAGS | SLAB_KMALLOC)
+ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
#else
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE)
#endif
@@ -319,6 +357,7 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
SLAB_TEMPORARY | \
SLAB_ACCOUNT | \
SLAB_KMALLOC | \
+ SLAB_NO_MERGE | \
SLAB_NO_USER_FLAGS)
bool __kmem_cache_empty(struct kmem_cache *);
@@ -832,16 +871,8 @@ struct kmem_obj_info {
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab);
#endif
-#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
void __check_heap_object(const void *ptr, unsigned long n,
const struct slab *slab, bool to_user);
-#else
-static inline
-void __check_heap_object(const void *ptr, unsigned long n,
- const struct slab *slab, bool to_user)
-{
-}
-#endif
#ifdef CONFIG_SLUB_DEBUG
void skip_orig_size_check(struct kmem_cache *s, const void *object);
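
The new freelist_aba_t above pairs the freelist pointer with a counter so both are exchanged in a single cmpxchg, which is what defeats ABA reuse of a recycled pointer. A simplified user-space sketch of the same idea, packing a 32-bit index and a 32-bit generation counter into one 64-bit word; the kernel uses pointer-sized fields and cmpxchg64/cmpxchg128 instead:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef union {
        struct {
                uint32_t head;          /* index of the first free object */
                uint32_t counter;       /* bumped on every update to defeat ABA */
        };
        uint64_t full;
} freelist64_t;

/* swap head and counter together, as one 64-bit compare-and-exchange */
bool update_freelist(_Atomic uint64_t *slot, uint32_t old_head,
                     uint32_t old_counter, uint32_t new_head)
{
        freelist64_t old = { .head = old_head, .counter = old_counter };
        freelist64_t new = { .head = new_head, .counter = old_counter + 1 };

        return atomic_compare_exchange_strong(slot, &old.full, new.full);
}
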
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 607249785c07..d1555ea2981a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -17,6 +17,8 @@
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
#include <linux/kasan.h>
@@ -47,7 +49,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
*/
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB | kasan_never_merge())
+ SLAB_FAILSLAB | SLAB_NO_MERGE | kasan_never_merge())
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
@@ -236,14 +238,12 @@ static struct kmem_cache *create_cache(const char *name,
s->refcount = 1;
list_add(&s->list, &slab_caches);
-out:
- if (err)
- return ERR_PTR(err);
return s;
out_free_cache:
kmem_cache_free(kmem_cache, s);
- goto out;
+out:
+ return ERR_PTR(err);
}
/**
@@ -658,17 +658,16 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
s->refcount = -1; /* Exempt from merging for now */
}
-struct kmem_cache *__init create_kmalloc_cache(const char *name,
- unsigned int size, slab_flags_t flags,
- unsigned int useroffset, unsigned int usersize)
+static struct kmem_cache *__init create_kmalloc_cache(const char *name,
+ unsigned int size,
+ slab_flags_t flags)
{
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
if (!s)
panic("Out of memory when creating slab %s\n", name);
- create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
- usersize);
+ create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
list_add(&s->list, &slab_caches);
s->refcount = 1;
return s;
@@ -863,9 +862,22 @@ void __init setup_kmalloc_cache_index_table(void)
}
}
-static void __init
+static unsigned int __kmalloc_minalign(void)
+{
+#ifdef CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC
+ if (io_tlb_default_mem.nslabs)
+ return ARCH_KMALLOC_MINALIGN;
+#endif
+ return dma_get_cache_alignment();
+}
+
+void __init
new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
{
+ unsigned int minalign = __kmalloc_minalign();
+ unsigned int aligned_size = kmalloc_info[idx].size;
+ int aligned_idx = idx;
+
if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
flags |= SLAB_RECLAIM_ACCOUNT;
} else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
@@ -878,17 +890,24 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
flags |= SLAB_CACHE_DMA;
}
- kmalloc_caches[type][idx] = create_kmalloc_cache(
- kmalloc_info[idx].name[type],
- kmalloc_info[idx].size, flags, 0,
- kmalloc_info[idx].size);
-
/*
* If CONFIG_MEMCG_KMEM is enabled, disable cache merging for
* KMALLOC_NORMAL caches.
*/
if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL))
- kmalloc_caches[type][idx]->refcount = -1;
+ flags |= SLAB_NO_MERGE;
+
+ if (minalign > ARCH_KMALLOC_MINALIGN) {
+ aligned_size = ALIGN(aligned_size, minalign);
+ aligned_idx = __kmalloc_index(aligned_size, false);
+ }
+
+ if (!kmalloc_caches[type][aligned_idx])
+ kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
+ kmalloc_info[aligned_idx].name[type],
+ aligned_size, flags);
+ if (idx != aligned_idx)
+ kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
}
/*
@@ -1141,7 +1160,7 @@ EXPORT_SYMBOL(kmalloc_large_node);
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Randomize a generic freelist */
-static void freelist_randomize(struct rnd_state *state, unsigned int *list,
+static void freelist_randomize(unsigned int *list,
unsigned int count)
{
unsigned int rand;
@@ -1152,8 +1171,7 @@ static void freelist_randomize(struct rnd_state *state, unsigned int *list,
/* Fisher-Yates shuffle */
for (i = count - 1; i > 0; i--) {
- rand = prandom_u32_state(state);
- rand %= (i + 1);
+ rand = get_random_u32_below(i + 1);
swap(list[i], list[rand]);
}
}
@@ -1162,7 +1180,6 @@ static void freelist_randomize(struct rnd_state *state, unsigned int *list,
int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
gfp_t gfp)
{
- struct rnd_state state;
if (count < 2 || cachep->random_seq)
return 0;
@@ -1171,10 +1188,7 @@ int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
if (!cachep->random_seq)
return -ENOMEM;
- /* Get best entropy at this stage of boot */
- prandom_seed_state(&state, get_random_long());
-
- freelist_randomize(&state, cachep->random_seq, count);
+ freelist_randomize(cachep->random_seq, count);
return 0;
}
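
new_kmalloc_cache() now rounds a bucket's size up to the DMA minimum alignment and, when that lands in a larger bucket, points the original index at the bigger cache. A toy user-space sketch of that size-to-bucket bump; it only models the power-of-two buckets, not the 96- and 192-byte ones handled by the real __kmalloc_index():

#include <stdio.h>

#define ARCH_KMALLOC_MINALIGN 8

static unsigned int align_up(unsigned int size, unsigned int align)
{
        return (size + align - 1) & ~(align - 1);
}

/* smallest power-of-two bucket index that can hold @size (8 bytes = index 3) */
static int simple_kmalloc_index(unsigned int size)
{
        int idx = 3;

        while ((1U << idx) < size)
                idx++;
        return idx;
}

int main(void)
{
        unsigned int size = 32;         /* original kmalloc-32 bucket */
        unsigned int minalign = 64;     /* e.g. a cache-line sized DMA alignment */
        unsigned int aligned = align_up(size, minalign);

        printf("kmalloc-%u -> index %d (aliased to kmalloc-%u, index %d)\n",
               size, simple_kmalloc_index(size),
               aligned, simple_kmalloc_index(aligned));
        return 0;
}
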
diff --git a/mm/slub.c b/mm/slub.c
index c87628cd8a9a..e3b5d5c0eb3a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -292,7 +292,12 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
/* Poison object */
#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
/* Use cmpxchg_double */
+
+#ifdef system_has_freelist_aba
#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
+#else
+#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U)
+#endif
/*
* Tracking user of a slab.
@@ -512,6 +517,40 @@ static __always_inline void slab_unlock(struct slab *slab)
__bit_spin_unlock(PG_locked, &page->flags);
}
+static inline bool
+__update_freelist_fast(struct slab *slab,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new)
+{
+#ifdef system_has_freelist_aba
+ freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
+
+ return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
+#else
+ return false;
+#endif
+}
+
+static inline bool
+__update_freelist_slow(struct slab *slab,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new)
+{
+ bool ret = false;
+
+ slab_lock(slab);
+ if (slab->freelist == freelist_old &&
+ slab->counters == counters_old) {
+ slab->freelist = freelist_new;
+ slab->counters = counters_new;
+ ret = true;
+ }
+ slab_unlock(slab);
+
+ return ret;
+}
+
/*
* Interrupts must be disabled (for the fallback code to work right), typically
* by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
@@ -519,33 +558,25 @@ static __always_inline void slab_unlock(struct slab *slab)
* allocation/ free operation in hardirq context. Therefore nothing can
* interrupt the operation.
*/
-static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
+ bool ret;
+
if (USE_LOCKLESS_FAST_PATH())
lockdep_assert_irqs_disabled();
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&slab->freelist, &slab->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
- slab_lock(slab);
- if (slab->freelist == freelist_old &&
- slab->counters == counters_old) {
- slab->freelist = freelist_new;
- slab->counters = counters_new;
- slab_unlock(slab);
- return true;
- }
- slab_unlock(slab);
+ ret = __update_freelist_fast(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
+ } else {
+ ret = __update_freelist_slow(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
}
+ if (likely(ret))
+ return true;
cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -557,36 +588,26 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab
return false;
}
-static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+ bool ret;
+
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&slab->freelist, &slab->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
+ ret = __update_freelist_fast(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
+ } else {
unsigned long flags;
local_irq_save(flags);
- slab_lock(slab);
- if (slab->freelist == freelist_old &&
- slab->counters == counters_old) {
- slab->freelist = freelist_new;
- slab->counters = counters_new;
- slab_unlock(slab);
- local_irq_restore(flags);
- return true;
- }
- slab_unlock(slab);
+ ret = __update_freelist_slow(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
local_irq_restore(flags);
}
+ if (likely(ret))
+ return true;
cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -1344,14 +1365,6 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct
list_del(&slab->slab_list);
}
-/* Tracking of the number of slabs for debugging purposes */
-static inline unsigned long slabs_node(struct kmem_cache *s, int node)
-{
- struct kmem_cache_node *n = get_node(s, node);
-
- return atomic_long_read(&n->nr_slabs);
-}
-
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{
return atomic_long_read(&n->nr_slabs);
@@ -1722,8 +1735,6 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
#define disable_higher_order_debug 0
-static inline unsigned long slabs_node(struct kmem_cache *s, int node)
- { return 0; }
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{ return 0; }
static inline void inc_slabs_node(struct kmem_cache *s, int node,
@@ -2228,7 +2239,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
VM_BUG_ON(new.frozen);
new.frozen = 1;
- if (!__cmpxchg_double_slab(s, slab,
+ if (!__slab_update_freelist(s, slab,
freelist, counters,
new.freelist, new.counters,
"acquire_slab"))
@@ -2554,7 +2565,7 @@ redo:
}
- if (!cmpxchg_double_slab(s, slab,
+ if (!slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab")) {
@@ -2611,7 +2622,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
new.frozen = 0;
- } while (!__cmpxchg_double_slab(s, slab,
+ } while (!__slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"));
@@ -3008,6 +3019,18 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
}
#ifndef CONFIG_SLUB_TINY
+static inline bool
+__update_cpu_freelist_fast(struct kmem_cache *s,
+ void *freelist_old, void *freelist_new,
+ unsigned long tid)
+{
+ freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
+
+ return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
+ &old.full, new.full);
+}
+
/*
* Check the slab->freelist and either transfer the freelist to the
* per cpu freelist or deactivate the slab.
@@ -3034,7 +3057,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
new.inuse = slab->objects;
new.frozen = freelist != NULL;
- } while (!__cmpxchg_double_slab(s, slab,
+ } while (!__slab_update_freelist(s, slab,
freelist, counters,
NULL, new.counters,
"get_freelist"));
@@ -3359,11 +3382,7 @@ redo:
* against code executing on this cpu *not* from access by
* other cpus.
*/
- if (unlikely(!this_cpu_cmpxchg_double(
- s->cpu_slab->freelist, s->cpu_slab->tid,
- object, tid,
- next_object, next_tid(tid)))) {
-
+ if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
@@ -3631,7 +3650,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
}
}
- } while (!cmpxchg_double_slab(s, slab,
+ } while (!slab_update_freelist(s, slab,
prior, counters,
head, new.counters,
"__slab_free"));
@@ -3736,11 +3755,7 @@ redo:
set_freepointer(s, tail_obj, freelist);
- if (unlikely(!this_cpu_cmpxchg_double(
- s->cpu_slab->freelist, s->cpu_slab->tid,
- freelist, tid,
- head, next_tid(tid)))) {
-
+ if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
@@ -4505,11 +4520,11 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
}
}
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
+#ifdef system_has_freelist_aba
+ if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
/* Enable fast mode */
s->flags |= __CMPXCHG_DOUBLE;
+ }
#endif
/*
@@ -4598,7 +4613,7 @@ bool __kmem_cache_empty(struct kmem_cache *s)
struct kmem_cache_node *n;
for_each_kmem_cache_node(s, node, n)
- if (n->nr_partial || slabs_node(s, node))
+ if (n->nr_partial || node_nr_slabs(n))
return false;
return true;
}
@@ -4615,7 +4630,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
/* Attempt to free all objects */
for_each_kmem_cache_node(s, node, n) {
free_partial(s, n);
- if (n->nr_partial || slabs_node(s, node))
+ if (n->nr_partial || node_nr_slabs(n))
return 1;
}
return 0;
@@ -4828,7 +4843,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
list_for_each_entry_safe(slab, t, &discard, slab_list)
free_slab(s, slab);
- if (slabs_node(s, node))
+ if (node_nr_slabs(n))
ret = 1;
}
@@ -5166,9 +5181,9 @@ static int validate_slab_node(struct kmem_cache *s,
validate_slab(s, slab, obj_map);
count++;
}
- if (count != atomic_long_read(&n->nr_slabs)) {
+ if (count != node_nr_slabs(n)) {
pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
- s->name, count, atomic_long_read(&n->nr_slabs));
+ s->name, count, node_nr_slabs(n));
slab_add_kunit_errors();
}
@@ -5452,12 +5467,11 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
for_each_kmem_cache_node(s, node, n) {
if (flags & SO_TOTAL)
- x = atomic_long_read(&n->total_objects);
+ x = node_nr_objs(n);
else if (flags & SO_OBJECTS)
- x = atomic_long_read(&n->total_objects) -
- count_partial(n, count_free);
+ x = node_nr_objs(n) - count_partial(n, count_free);
else
- x = atomic_long_read(&n->nr_slabs);
+ x = node_nr_slabs(n);
total += x;
nodes[node] += x;
}
@@ -5612,12 +5626,6 @@ static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(cpu_slabs);
-static ssize_t objects_show(struct kmem_cache *s, char *buf)
-{
- return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
-}
-SLAB_ATTR_RO(objects);
-
static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
{
return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
@@ -5646,7 +5654,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
objects = (slabs * oo_objects(s->oo)) / 2;
len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
-#if defined(CONFIG_SLUB_CPU_PARTIAL) && defined(CONFIG_SMP)
+#ifdef CONFIG_SLUB_CPU_PARTIAL
for_each_online_cpu(cpu) {
struct slab *slab;
@@ -5712,6 +5720,12 @@ static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(total_objects);
+static ssize_t objects_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
+}
+SLAB_ATTR_RO(objects);
+
static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
@@ -5943,7 +5957,6 @@ static struct attribute *slab_attrs[] = {
&order_attr.attr,
&min_partial_attr.attr,
&cpu_partial_attr.attr,
- &objects_attr.attr,
&objects_partial_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
@@ -5957,6 +5970,7 @@ static struct attribute *slab_attrs[] = {
&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
&total_objects_attr.attr,
+ &objects_attr.attr,
&slabs_attr.attr,
&sanity_checks_attr.attr,
&trace_attr.attr,
@@ -6224,7 +6238,7 @@ static int __init slab_sysfs_init(void)
if (!slab_kset) {
mutex_unlock(&slab_mutex);
pr_err("Cannot register slab subsystem.\n");
- return -ENOSYS;
+ return -ENOMEM;
}
slab_state = FULL;
@@ -6396,7 +6410,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
unsigned long flags;
struct slab *slab;
- if (!atomic_long_read(&n->nr_slabs))
+ if (!node_nr_slabs(n))
continue;
spin_lock_irqsave(&n->list_lock, flags);
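
__slab_update_freelist() and slab_update_freelist() above now share one fast/slow split: try the lockless double-word cmpxchg when the cache advertises __CMPXCHG_DOUBLE, otherwise compare and update under the slab lock. A condensed user-space sketch of that split, with a packed 64-bit word and a pthread mutex standing in for the freelist pair and the bit spinlock:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct slab_like {
        _Atomic uint64_t word;          /* packed freelist pointer + counters */
        pthread_mutex_t lock;           /* stands in for the slab bit spinlock */
        bool has_fast_cas;              /* like the __CMPXCHG_DOUBLE cache flag */
};

bool update_word(struct slab_like *s, uint64_t old, uint64_t new)
{
        bool ret;

        if (s->has_fast_cas)            /* fast path: one atomic compare-and-swap */
                return atomic_compare_exchange_strong(&s->word, &old, new);

        /* slow path: do the compare and the store under the lock */
        pthread_mutex_lock(&s->lock);
        ret = false;
        if (atomic_load_explicit(&s->word, memory_order_relaxed) == old) {
                atomic_store_explicit(&s->word, new, memory_order_relaxed);
                ret = true;
        }
        pthread_mutex_unlock(&s->lock);
        return ret;
}
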
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 10d73a0dfcec..a044a130405b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -133,7 +133,7 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size,
void __meminit vmemmap_verify(pte_t *pte, int node,
unsigned long start, unsigned long end)
{
- unsigned long pfn = pte_pfn(*pte);
+ unsigned long pfn = pte_pfn(ptep_get(pte));
int actual_node = early_pfn_to_nid(pfn);
if (node_distance(actual_node, node) > LOCAL_DISTANCE)
@@ -146,7 +146,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
struct page *reuse)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
- if (pte_none(*pte)) {
+ if (pte_none(ptep_get(pte))) {
pte_t entry;
void *p;
@@ -414,7 +414,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
* with just tail struct pages.
*/
return vmemmap_populate_range(start, end, node, NULL,
- pte_page(*pte));
+ pte_page(ptep_get(pte)));
}
size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
@@ -438,7 +438,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
*/
next += PAGE_SIZE;
rc = vmemmap_populate_range(next, last, node, NULL,
- pte_page(*pte));
+ pte_page(ptep_get(pte)));
if (rc)
return -ENOMEM;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index c2afdb26039e..297a8b772e8d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -701,7 +701,7 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
return rc;
}
#else
-struct page * __meminit populate_section_memmap(unsigned long pfn,
+static struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
{
@@ -922,10 +922,14 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
return 0;
}
-void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
- unsigned long nr_pages, unsigned long map_offset,
- struct vmem_altmap *altmap)
+void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
{
+ struct mem_section *ms = __pfn_to_section(pfn);
+
+ if (WARN_ON_ONCE(!valid_section(ms)))
+ return;
+
section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 423199ee8478..cd8f0150ba3a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -76,7 +76,7 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
/*
* This path almost never happens for VM activity - pages are normally freed
- * via pagevecs. But it gets used by networking - and for compound pages.
+ * in batches. But it gets used by networking - and for compound pages.
*/
static void __page_cache_release(struct folio *folio)
{
@@ -1044,25 +1044,25 @@ void release_pages(release_pages_arg arg, int nr)
EXPORT_SYMBOL(release_pages);
/*
- * The pages which we're about to release may be in the deferred lru-addition
+ * The folios which we're about to release may be in the deferred lru-addition
* queues. That would prevent them from really being freed right now. That's
- * OK from a correctness point of view but is inefficient - those pages may be
+ * OK from a correctness point of view but is inefficient - those folios may be
* cache-warm and we want to give them back to the page allocator ASAP.
*
- * So __pagevec_release() will drain those queues here.
+ * So __folio_batch_release() will drain those queues here.
* folio_batch_move_lru() calls folios_put() directly to avoid
* mutual recursion.
*/
-void __pagevec_release(struct pagevec *pvec)
+void __folio_batch_release(struct folio_batch *fbatch)
{
- if (!pvec->percpu_pvec_drained) {
+ if (!fbatch->percpu_pvec_drained) {
lru_add_drain();
- pvec->percpu_pvec_drained = true;
+ fbatch->percpu_pvec_drained = true;
}
- release_pages(pvec->pages, pagevec_count(pvec));
- pagevec_reinit(pvec);
+ release_pages(fbatch->folios, folio_batch_count(fbatch));
+ folio_batch_reinit(fbatch);
}
-EXPORT_SYMBOL(__pagevec_release);
+EXPORT_SYMBOL(__folio_batch_release);
/**
* folio_batch_remove_exceptionals() - Prune non-folios from a batch.
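
__folio_batch_release() keeps the old drain-once behaviour: deferred per-CPU additions are flushed the first time a given batch is released, then the whole array is handed back and the batch is reset. A minimal sketch of that pattern; struct batch, drain_staging() and release_items() are illustrative stand-ins, not the folio_batch API:

#include <stdbool.h>

#define BATCH_SIZE 31

struct batch {
        unsigned char nr;
        bool drained;
        void *items[BATCH_SIZE];
};

static void drain_staging(void) { }     /* stands in for lru_add_drain() */
static void release_items(void **items, unsigned char nr) { (void)items; (void)nr; }

void batch_release(struct batch *b)
{
        if (!b->drained) {
                drain_staging();        /* flush deferred additions only once per batch */
                b->drained = true;
        }
        release_items(b->items, b->nr); /* like release_pages() */
        b->nr = 0;                      /* reset so the batch can be reused */
        b->drained = false;
}
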
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b76a65ac28b3..f8ea7015bad4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -16,7 +16,6 @@
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
-#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
@@ -275,9 +274,9 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
}
}
-/*
- * If we are the only user, then try to free up the swap cache.
- *
+/*
+ * If we are the only user, then try to free up the swap cache.
+ *
* Its ok to check the swapcache flag without the folio lock
* here because we are going to recheck again inside
* folio_free_swap() _with_ the lock.
@@ -294,7 +293,7 @@ void free_swap_cache(struct page *page)
}
}
-/*
+/*
* Perform a free_page(), also freeing any swap cache associated with
* this page if it is the last user of the page.
*/
@@ -417,9 +416,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
{
struct swap_info_struct *si;
struct folio *folio;
+ struct page *page;
void *shadow = NULL;
*new_page_allocated = false;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
for (;;) {
int err;
@@ -428,14 +431,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after swap_cache_get_folio() failed, re-calling
* that would confuse statistics.
*/
- si = get_swap_device(entry);
- if (!si)
- return NULL;
folio = filemap_get_folio(swap_address_space(entry),
swp_offset(entry));
- put_swap_device(si);
- if (!IS_ERR(folio))
- return folio_file_page(folio, swp_offset(entry));
+ if (!IS_ERR(folio)) {
+ page = folio_file_page(folio, swp_offset(entry));
+ goto got_page;
+ }
/*
* Just skip read ahead for unused swap slot.
@@ -445,8 +446,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* as SWAP_HAS_CACHE. That's done in later part of code or
* else swap_off will be aborted if we return NULL.
*/
- if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
- return NULL;
+ if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
+ goto fail_put_swap;
/*
* Get a new page to read into from swap. Allocate it now,
@@ -455,7 +456,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*/
folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
if (!folio)
- return NULL;
+ goto fail_put_swap;
/*
* Swap entry may have been freed since our caller observed it.
@@ -466,7 +467,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
folio_put(folio);
if (err != -EEXIST)
- return NULL;
+ goto fail_put_swap;
/*
* We might race against __delete_from_swap_cache(), and
@@ -500,12 +501,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/* Caller will initiate read into locked folio */
folio_add_lru(folio);
*new_page_allocated = true;
- return &folio->page;
+ page = &folio->page;
+got_page:
+ put_swap_device(si);
+ return page;
fail_unlock:
put_swap_folio(folio, entry);
folio_unlock(folio);
folio_put(folio);
+fail_put_swap:
+ put_swap_device(si);
return NULL;
}
@@ -514,6 +520,10 @@ fail_unlock:
* and reading the disk if it is not already cached.
* A failure return means that either the page allocation failed or that
* the swap entry is no longer in use.
+ *
+ * The caller does not need to hold get/put_swap_device() around this
+ * function, because __read_swap_cache_async() calls them and
+ * swap_readpage() holds the swap cache folio lock.
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
@@ -698,6 +708,14 @@ void exit_swap_address_space(unsigned int type)
swapper_spaces[type] = NULL;
}
+#define SWAP_RA_ORDER_CEILING 5
+
+struct vma_swap_readahead {
+ unsigned short win;
+ unsigned short offset;
+ unsigned short nr_pte;
+};
+
static void swap_ra_info(struct vm_fault *vmf,
struct vma_swap_readahead *ra_info)
{
@@ -705,11 +723,7 @@ static void swap_ra_info(struct vm_fault *vmf,
unsigned long ra_val;
unsigned long faddr, pfn, fpfn, lpfn, rpfn;
unsigned long start, end;
- pte_t *pte, *orig_pte;
unsigned int max_win, hits, prev_win, win;
-#ifndef CONFIG_64BIT
- pte_t *tpte;
-#endif
max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
SWAP_RA_ORDER_CEILING);
@@ -728,12 +742,9 @@ static void swap_ra_info(struct vm_fault *vmf,
max_win, prev_win);
atomic_long_set(&vma->swap_readahead_info,
SWAP_RA_VAL(faddr, win, 0));
-
if (win == 1)
return;
- /* Copy the PTEs because the page table may be unmapped */
- orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
if (fpfn == pfn + 1) {
lpfn = fpfn;
rpfn = fpfn + win;
@@ -753,15 +764,6 @@ static void swap_ra_info(struct vm_fault *vmf,
ra_info->nr_pte = end - start;
ra_info->offset = fpfn - start;
- pte -= ra_info->offset;
-#ifdef CONFIG_64BIT
- ra_info->ptes = pte;
-#else
- tpte = ra_info->ptes;
- for (pfn = start; pfn != end; pfn++)
- *tpte++ = *pte++;
-#endif
- pte_unmap(orig_pte);
}
/**
@@ -785,7 +787,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
struct swap_iocb *splug = NULL;
struct vm_area_struct *vma = vmf->vma;
struct page *page;
- pte_t *pte, pentry;
+ pte_t *pte = NULL, pentry;
+ unsigned long addr;
swp_entry_t entry;
unsigned int i;
bool page_allocated;
@@ -797,17 +800,25 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
if (ra_info.win == 1)
goto skip;
+ addr = vmf->address - (ra_info.offset * PAGE_SIZE);
+
blk_start_plug(&plug);
- for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
- i++, pte++) {
- pentry = *pte;
+ for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) {
+ if (!pte++) {
+ pte = pte_offset_map(vmf->pmd, addr);
+ if (!pte)
+ break;
+ }
+ pentry = ptep_get_lockless(pte);
if (!is_swap_pte(pentry))
continue;
entry = pte_to_swp_entry(pentry);
if (unlikely(non_swap_entry(entry)))
continue;
+ pte_unmap(pte);
+ pte = NULL;
page = __read_swap_cache_async(entry, gfp_mask, vma,
- vmf->address, &page_allocated);
+ addr, &page_allocated);
if (!page)
continue;
if (page_allocated) {
@@ -819,6 +830,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
}
put_page(page);
}
+ if (pte)
+ pte_unmap(pte);
blk_finish_plug(&plug);
swap_read_unplug(splug);
lru_add_drain();
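
__read_swap_cache_async() now pins the swap device once at entry and drops the reference on every exit path, rather than only around the cache lookup. A small user-space sketch of that acquire-once, goto-cleanup shape; get_ref()/put_ref() and the malloc'd page are placeholders for get_swap_device()/put_swap_device() and the folio allocation:

#include <stdbool.h>
#include <stdlib.h>

struct dev { int refs; };

static bool get_ref(struct dev *d) { d->refs++; return true; }  /* like get_swap_device() */
static void put_ref(struct dev *d) { d->refs--; }               /* like put_swap_device() */

void *read_cached(struct dev *d, bool *new_page_allocated)
{
        void *page;

        *new_page_allocated = false;
        if (!get_ref(d))                /* device stays pinned for the whole operation */
                return NULL;

        page = malloc(4096);            /* stand-in for allocating and adding a folio */
        if (!page)
                goto fail_put;

        *new_page_allocated = true;
        put_ref(d);                     /* the success path also drops the reference */
        return page;

fail_put:
        put_ref(d);
        return NULL;
}
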
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 274bbf797480..8e6dde68b389 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -41,6 +41,7 @@
#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
+#include <linux/suspend.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
@@ -1219,6 +1220,13 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
}
/*
+ * When we get a swap entry, if there is nothing else preventing
+ * swapoff (such as the folio in the swap cache being locked, or the
+ * page table lock being held), the swap entry may become invalid
+ * because of swapoff.  Then we need to enclose all swap related
+ * functions with get_swap_device() and put_swap_device(), unless the
+ * swap functions call get/put_swap_device() by themselves.
+ *
* Check whether swap entry is valid in the swap device. If so,
* return pointer to swap_info_struct, and keep the swap entry valid
* via preventing the swap device from being swapoff, until
@@ -1227,9 +1235,8 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
* Notice that swapoff or swapoff+swapon can still happen before the
* percpu_ref_tryget_live() in get_swap_device() or after the
* percpu_ref_put() in put_swap_device() if there isn't any other way
- * to prevent swapoff, such as page lock, page table lock, etc. The
- * caller must be prepared for that. For example, the following
- * situation is possible.
+ * to prevent swapoff. The caller must be prepared for that. For
+ * example, the following situation is possible.
*
* CPU1 CPU2
* do_swap_page()
@@ -1432,16 +1439,10 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
int __swap_count(swp_entry_t entry)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = swp_swap_info(entry);
pgoff_t offset = swp_offset(entry);
- int count = 0;
- si = get_swap_device(entry);
- if (si) {
- count = swap_count(si->swap_map[offset]);
- put_swap_device(si);
- }
- return count;
+ return swap_count(si->swap_map[offset]);
}
/*
@@ -1449,7 +1450,7 @@ int __swap_count(swp_entry_t entry)
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
-static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
pgoff_t offset = swp_offset(entry);
struct swap_cluster_info *ci;
@@ -1463,24 +1464,6 @@ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
/*
* How many references to @entry are currently swapped out?
- * This does not give an exact answer when swap count is continued,
- * but does include the high COUNT_CONTINUED flag to allow for that.
- */
-int __swp_swapcount(swp_entry_t entry)
-{
- int count = 0;
- struct swap_info_struct *si;
-
- si = get_swap_device(entry);
- if (si) {
- count = swap_swapcount(si, entry);
- put_swap_device(si);
- }
- return count;
-}
-
-/*
- * How many references to @entry are currently swapped out?
* This considers COUNT_CONTINUED so it returns exact answer.
*/
int swp_swapcount(swp_entry_t entry)
@@ -1762,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page = folio_file_page(folio, swp_offset(entry));
struct page *swapcache;
spinlock_t *ptl;
- pte_t *pte, new_pte;
+ pte_t *pte, new_pte, old_pte;
bool hwposioned = false;
int ret = 1;
@@ -1774,11 +1757,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
hwposioned = true;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
+ if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
+ swp_entry_to_pte(entry)))) {
ret = 0;
goto out;
}
+ old_pte = ptep_get(pte);
+
if (unlikely(hwposioned || !PageUptodate(page))) {
swp_entry_t swp_entry;
@@ -1810,7 +1796,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
* call and have the page locked.
*/
VM_BUG_ON_PAGE(PageWriteback(page), page);
- if (pte_swp_exclusive(*pte))
+ if (pte_swp_exclusive(old_pte))
rmap_flags |= RMAP_EXCLUSIVE;
page_add_anon_rmap(page, vma, addr, rmap_flags);
@@ -1819,15 +1805,16 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
lru_cache_add_inactive_or_unevictable(page, vma);
}
new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
- if (pte_swp_soft_dirty(*pte))
+ if (pte_swp_soft_dirty(old_pte))
new_pte = pte_mksoft_dirty(new_pte);
- if (pte_swp_uffd_wp(*pte))
+ if (pte_swp_uffd_wp(old_pte))
new_pte = pte_mkuffd_wp(new_pte);
setpte:
set_pte_at(vma->vm_mm, addr, pte, new_pte);
swap_free(entry);
out:
- pte_unmap_unlock(pte, ptl);
+ if (pte)
+ pte_unmap_unlock(pte, ptl);
if (page != swapcache) {
unlock_page(page);
put_page(page);
@@ -1839,27 +1826,37 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned int type)
{
- swp_entry_t entry;
- pte_t *pte;
+ pte_t *pte = NULL;
struct swap_info_struct *si;
- int ret = 0;
si = swap_info[type];
- pte = pte_offset_map(pmd, addr);
do {
struct folio *folio;
unsigned long offset;
unsigned char swp_count;
+ swp_entry_t entry;
+ int ret;
+ pte_t ptent;
+
+ if (!pte++) {
+ pte = pte_offset_map(pmd, addr);
+ if (!pte)
+ break;
+ }
+
+ ptent = ptep_get_lockless(pte);
- if (!is_swap_pte(*pte))
+ if (!is_swap_pte(ptent))
continue;
- entry = pte_to_swp_entry(*pte);
+ entry = pte_to_swp_entry(ptent);
if (swp_type(entry) != type)
continue;
offset = swp_offset(entry);
pte_unmap(pte);
+ pte = NULL;
+
folio = swap_cache_get_folio(entry, vma, addr);
if (!folio) {
struct page *page;
@@ -1878,8 +1875,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!folio) {
swp_count = READ_ONCE(si->swap_map[offset]);
if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
- goto try_next;
-
+ continue;
return -ENOMEM;
}
@@ -1889,20 +1885,17 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (ret < 0) {
folio_unlock(folio);
folio_put(folio);
- goto out;
+ return ret;
}
folio_free_swap(folio);
folio_unlock(folio);
folio_put(folio);
-try_next:
- pte = pte_offset_map(pmd, addr);
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ } while (addr += PAGE_SIZE, addr != end);
- ret = 0;
-out:
- return ret;
+ if (pte)
+ pte_unmap(pte);
+ return 0;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -1917,8 +1910,6 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
do {
cond_resched();
next = pmd_addr_end(addr, end);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- continue;
ret = unuse_pte_range(vma, pmd, addr, next, type);
if (ret)
return ret;
@@ -2539,7 +2530,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct block_device *bdev = I_BDEV(inode);
set_blocksize(bdev, old_block_size);
- blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(bdev, p);
}
inode_lock(inode);
@@ -2770,7 +2761,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
if (S_ISBLK(inode->i_mode)) {
p->bdev = blkdev_get_by_dev(inode->i_rdev,
- FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
+ BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
if (IS_ERR(p->bdev)) {
error = PTR_ERR(p->bdev);
p->bdev = NULL;
@@ -3221,7 +3212,7 @@ bad_swap:
p->cluster_next_cpu = NULL;
if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
set_blocksize(p->bdev, p->old_block_size);
- blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(p->bdev, p);
}
inode = NULL;
destroy_swap_extents(p);
@@ -3288,9 +3279,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unsigned char has_cache;
int err;
- p = get_swap_device(entry);
- if (!p)
- return -EINVAL;
+ p = swp_swap_info(entry);
offset = swp_offset(entry);
ci = lock_cluster_or_swap_info(p, offset);
@@ -3337,7 +3326,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
- put_swap_device(p);
return err;
}
@@ -3468,11 +3456,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
goto out;
}
- /*
- * We are fortunate that although vmalloc_to_page uses pte_offset_map,
- * no architecture is using highmem pages for kernel page tables: so it
- * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
- */
head = vmalloc_to_page(si->swap_map + offset);
offset &= ~PAGE_MASK;
diff --git a/mm/truncate.c b/mm/truncate.c
index 86de31ed4d32..95d1291d269b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -486,18 +486,17 @@ void truncate_inode_pages_final(struct address_space *mapping)
EXPORT_SYMBOL(truncate_inode_pages_final);
/**
- * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
+ * mapping_try_invalidate - Invalidate all the evictable folios of one inode
+ * @mapping: the address_space which holds the folios to invalidate
* @start: the offset 'from' which to invalidate
* @end: the offset 'to' which to invalidate (inclusive)
- * @nr_pagevec: invalidate failed page number for caller
+ * @nr_failed: How many folio invalidations failed
*
- * This helper is similar to invalidate_mapping_pages(), except that it accounts
- * for pages that are likely on a pagevec and counts them in @nr_pagevec, which
- * will be used by the caller.
+ * This function is similar to invalidate_mapping_pages(), except that it
+ * returns the number of folios which could not be evicted in @nr_failed.
*/
-unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
- pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
+unsigned long mapping_try_invalidate(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_failed)
{
pgoff_t indices[PAGEVEC_SIZE];
struct folio_batch fbatch;
@@ -527,9 +526,9 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
*/
if (!ret) {
deactivate_file_folio(folio);
- /* It is likely on the pagevec of a remote CPU */
- if (nr_pagevec)
- (*nr_pagevec)++;
+ /* Likely in the lru cache of a remote CPU */
+ if (nr_failed)
+ (*nr_failed)++;
}
count += ret;
}
@@ -552,12 +551,12 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
* If you want to remove all the pages of one inode, regardless of
* their use and writeback state, use truncate_inode_pages().
*
- * Return: the number of the cache entries that were invalidated
+ * Return: The number of indices that had their contents invalidated
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- return invalidate_mapping_pagevec(mapping, start, end, NULL);
+ return mapping_try_invalidate(mapping, start, end, NULL);
}
EXPORT_SYMBOL(invalidate_mapping_pages);
@@ -566,7 +565,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
* refcount. We do this because invalidate_inode_pages2() needs stronger
* invalidation guarantees, and cannot afford to leave pages behind because
* shrink_page_list() has a temp ref on them, or because they're transiently
- * sitting in the folio_add_lru() pagevecs.
+ * sitting in the folio_add_lru() caches.
*/
static int invalidate_complete_folio2(struct address_space *mapping,
struct folio *folio)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e97a0b4889fc..a2bf37ee276d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -76,7 +76,10 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
if (flags & MFILL_ATOMIC_WP)
_dst_pte = pte_mkuffd_wp(_dst_pte);
+ ret = -EAGAIN;
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
if (vma_is_shmem(dst_vma)) {
/* serialize against truncate with the page table lock */
@@ -94,7 +97,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
* registered, we firstly wr-protect a none pte which has no page cache
* page backing it, then access the page.
*/
- if (!pte_none_mostly(*dst_pte))
+ if (!pte_none_mostly(ptep_get(dst_pte)))
goto out_unlock;
folio = page_folio(page);
@@ -121,6 +124,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
ret = 0;
out_unlock:
pte_unmap_unlock(dst_pte, ptl);
+out:
return ret;
}
@@ -212,7 +216,10 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
+ ret = -EAGAIN;
dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
if (dst_vma->vm_file) {
/* the shmem MAP_PRIVATE case requires checking the i_size */
inode = dst_vma->vm_file->f_inode;
@@ -223,7 +230,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
goto out_unlock;
}
ret = -EEXIST;
- if (!pte_none(*dst_pte))
+ if (!pte_none(ptep_get(dst_pte)))
goto out_unlock;
set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
@@ -231,6 +238,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
ret = 0;
out_unlock:
pte_unmap_unlock(dst_pte, ptl);
+out:
return ret;
}
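
Both mfill helpers above preset -EAGAIN and bail out when pte_offset_map_lock() comes back empty, since the page table can be freed underneath them. A compact sketch of that preset-the-error pattern with invented helpers; map_entry_locked() and unmap_unlock() are illustrative only:

#include <errno.h>
#include <stddef.h>

struct entry { int val; };

/* stand-in for pte_offset_map_lock(): may fail and return NULL */
static struct entry *map_entry_locked(struct entry *table, unsigned long idx)
{
        return table ? &table[idx] : NULL;
}

static void unmap_unlock(struct entry *e) { (void)e; }

int install_at(struct entry *table, unsigned long idx, int val)
{
        struct entry *e;
        int ret = -EAGAIN;              /* assume the mapping step fails */

        e = map_entry_locked(table, idx);
        if (!e)
                goto out;               /* table went away; the caller can retry */

        e->val = val;
        ret = 0;
        unmap_unlock(e);
out:
        return ret;
}
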
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9683573f1225..93cf99aba335 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -103,7 +103,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (!pte)
return -ENOMEM;
do {
- BUG_ON(!pte_none(*pte));
+ BUG_ON(!pte_none(ptep_get(pte)));
#ifdef CONFIG_HUGETLB_PAGE
size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
@@ -472,7 +472,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
do {
struct page *page = pages[*nr];
- if (WARN_ON(!pte_none(*pte)))
+ if (WARN_ON(!pte_none(ptep_get(pte))))
return -EBUSY;
if (WARN_ON(!page))
return -ENOMEM;
@@ -703,11 +703,10 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
- ptep = pte_offset_map(pmd, addr);
- pte = *ptep;
+ ptep = pte_offset_kernel(pmd, addr);
+ pte = ptep_get(ptep);
if (pte_present(pte))
page = pte_page(pte);
- pte_unmap(ptep);
return page;
}
@@ -791,7 +790,7 @@ get_subtree_max_size(struct rb_node *node)
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
-static void purge_vmap_area_lazy(void);
+static void reclaim_and_purge_vmap_areas(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
@@ -1649,7 +1648,7 @@ retry:
overflow:
if (!purged) {
- purge_vmap_area_lazy();
+ reclaim_and_purge_vmap_areas();
purged = 1;
goto retry;
}
@@ -1785,9 +1784,10 @@ out:
}
/*
- * Kick off a purge of the outstanding lazy areas.
+ * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list.
*/
-static void purge_vmap_area_lazy(void)
+static void reclaim_and_purge_vmap_areas(void)
+
{
mutex_lock(&vmap_purge_lock);
purge_fragmented_blocks_allcpus();
@@ -1908,6 +1908,12 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
+/*
+ * Purge threshold to prevent overeager purging of fragmented blocks for
+ * regular operations: Purge if vb->free is less than 1/4 of the capacity.
+ */
+#define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4)
+
#define VMAP_RAM 0x1 /* indicates vm_map_ram area*/
#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/
#define VMAP_FLAGS_MASK 0x3
@@ -2086,39 +2092,62 @@ static void free_vmap_block(struct vmap_block *vb)
kfree_rcu(vb, rcu_head);
}
+static bool purge_fragmented_block(struct vmap_block *vb,
+ struct vmap_block_queue *vbq, struct list_head *purge_list,
+ bool force_purge)
+{
+ if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
+ vb->dirty == VMAP_BBMAP_BITS)
+ return false;
+
+ /* Don't overeagerly purge usable blocks unless requested */
+ if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD))
+ return false;
+
+ /* prevent further allocs after releasing lock */
+ WRITE_ONCE(vb->free, 0);
+ /* prevent purging it again */
+ WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
+ vb->dirty_min = 0;
+ vb->dirty_max = VMAP_BBMAP_BITS;
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ list_add_tail(&vb->purge, purge_list);
+ return true;
+}
+
+static void free_purged_blocks(struct list_head *purge_list)
+{
+ struct vmap_block *vb, *n_vb;
+
+ list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
+ list_del(&vb->purge);
+ free_vmap_block(vb);
+ }
+}
+
static void purge_fragmented_blocks(int cpu)
{
LIST_HEAD(purge);
struct vmap_block *vb;
- struct vmap_block *n_vb;
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ unsigned long free = READ_ONCE(vb->free);
+ unsigned long dirty = READ_ONCE(vb->dirty);
- if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
+ if (free + dirty != VMAP_BBMAP_BITS ||
+ dirty == VMAP_BBMAP_BITS)
continue;
spin_lock(&vb->lock);
- if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
- vb->free = 0; /* prevent further allocs after releasing lock */
- vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
- vb->dirty_min = 0;
- vb->dirty_max = VMAP_BBMAP_BITS;
- spin_lock(&vbq->lock);
- list_del_rcu(&vb->free_list);
- spin_unlock(&vbq->lock);
- spin_unlock(&vb->lock);
- list_add_tail(&vb->purge, &purge);
- } else
- spin_unlock(&vb->lock);
+ purge_fragmented_block(vb, vbq, &purge, true);
+ spin_unlock(&vb->lock);
}
rcu_read_unlock();
-
- list_for_each_entry_safe(vb, n_vb, &purge, purge) {
- list_del(&vb->purge);
- free_vmap_block(vb);
- }
+ free_purged_blocks(&purge);
}
static void purge_fragmented_blocks_allcpus(void)
@@ -2153,6 +2182,9 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
unsigned long pages_off;
+ if (READ_ONCE(vb->free) < (1UL << order))
+ continue;
+
spin_lock(&vb->lock);
if (vb->free < (1UL << order)) {
spin_unlock(&vb->lock);
@@ -2161,7 +2193,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
pages_off = VMAP_BBMAP_BITS - vb->free;
vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
- vb->free -= 1UL << order;
+ WRITE_ONCE(vb->free, vb->free - (1UL << order));
bitmap_set(vb->used_map, pages_off, (1UL << order));
if (vb->free == 0) {
spin_lock(&vbq->lock);
@@ -2211,11 +2243,11 @@ static void vb_free(unsigned long addr, unsigned long size)
spin_lock(&vb->lock);
- /* Expand dirty range */
+ /* Expand the not-yet-TLB-flushed dirty range */
vb->dirty_min = min(vb->dirty_min, offset);
vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
- vb->dirty += 1UL << order;
+ WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
if (vb->dirty == VMAP_BBMAP_BITS) {
BUG_ON(vb->free);
spin_unlock(&vb->lock);
@@ -2226,21 +2258,30 @@ static void vb_free(unsigned long addr, unsigned long size)
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
+ LIST_HEAD(purge_list);
int cpu;
if (unlikely(!vmap_initialized))
return;
- might_sleep();
+ mutex_lock(&vmap_purge_lock);
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
+ unsigned long idx;
rcu_read_lock();
- list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ xa_for_each(&vbq->vmap_blocks, idx, vb) {
spin_lock(&vb->lock);
- if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
+
+ /*
+ * Try to purge a fragmented block first. If it's
+ * not purgeable, check whether there is dirty
+ * space to be flushed.
+ */
+ if (!purge_fragmented_block(vb, vbq, &purge_list, false) &&
+ vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
unsigned long va_start = vb->va->va_start;
unsigned long s, e;
@@ -2250,15 +2291,18 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
start = min(s, start);
end = max(e, end);
+ /* Prevent this range from being flushed again */
+ vb->dirty_min = VMAP_BBMAP_BITS;
+ vb->dirty_max = 0;
+
flush = 1;
}
spin_unlock(&vb->lock);
}
rcu_read_unlock();
}
+ free_purged_blocks(&purge_list);
- mutex_lock(&vmap_purge_lock);
- purge_fragmented_blocks_allcpus();
if (!__purge_vmap_area_lazy(start, end) && flush)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
@@ -2899,10 +2943,16 @@ struct vmap_pfn_data {
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
struct vmap_pfn_data *data = private;
+ unsigned long pfn = data->pfns[data->idx];
+ pte_t ptent;
- if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
+ if (WARN_ON_ONCE(pfn_valid(pfn)))
return -EINVAL;
- *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
+
+ ptent = pte_mkspecial(pfn_pte(pfn, data->prot));
+ set_pte_at(&init_mm, addr, pte, ptent);
+
+ data->idx++;
return 0;
}
@@ -3098,11 +3148,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* allocation request, free them via vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
- /* vm_area_alloc_pages() can also fail due to a fatal signal */
- if (!fatal_signal_pending(current))
+ /*
+ * vm_area_alloc_pages() can fail due to insufficient memory, but
+ * also because of:
+ *
+ * - a pending fatal signal
+ * - insufficient huge page-order pages
+ *
+ * Since we always retry allocations at order-0 in the huge page
+ * case, a warning for either is spurious.
+ */
+ if (!fatal_signal_pending(current) && page_order == 0)
warn_alloc(gfp_mask, NULL,
- "vmalloc error: size %lu, page order %u, failed to allocate pages",
- area->nr_pages * PAGE_SIZE, page_order);
+ "vmalloc error: size %lu, failed to allocate pages",
+ area->nr_pages * PAGE_SIZE);
goto fail;
}
@@ -3511,7 +3570,7 @@ static size_t zero_iter(struct iov_iter *iter, size_t count)
while (remains > 0) {
size_t num, copied;
- num = remains < PAGE_SIZE ? remains : PAGE_SIZE;
+ num = min_t(size_t, remains, PAGE_SIZE);
copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter);
remains -= copied;
@@ -4142,7 +4201,7 @@ recovery:
overflow:
spin_unlock(&free_vmap_area_lock);
if (!purged) {
- purge_vmap_area_lazy();
+ reclaim_and_purge_vmap_areas();
purged = true;
/* Before "retry", check if we recover. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6d0cd2840cf0..1080209a568b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,7 +35,7 @@
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
-#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -57,7 +57,6 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
-#include <linux/srcu.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -190,9 +189,7 @@ struct scan_control {
int vm_swappiness = 60;
LIST_HEAD(shrinker_list);
-DEFINE_MUTEX(shrinker_mutex);
-DEFINE_SRCU(shrinker_srcu);
-static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
+DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG
static int shrinker_nr_max;
@@ -211,21 +208,8 @@ static inline int shrinker_defer_size(int nr_items)
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
int nid)
{
- return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info,
- &shrinker_srcu,
- lockdep_is_held(&shrinker_mutex));
-}
-
-static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg,
- int nid)
-{
- return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info,
- &shrinker_srcu);
-}
-
-static void free_shrinker_info_rcu(struct rcu_head *head)
-{
- kvfree(container_of(head, struct shrinker_info, rcu));
+ return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+ lockdep_is_held(&shrinker_rwsem));
}
static int expand_one_shrinker_info(struct mem_cgroup *memcg,
@@ -266,7 +250,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg,
defer_size - old_defer_size);
rcu_assign_pointer(pn->shrinker_info, new);
- call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu);
+ kvfree_rcu(old, rcu);
}
return 0;
@@ -292,7 +276,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
int nid, size, ret = 0;
int map_size, defer_size = 0;
- mutex_lock(&shrinker_mutex);
+ down_write(&shrinker_rwsem);
map_size = shrinker_map_size(shrinker_nr_max);
defer_size = shrinker_defer_size(shrinker_nr_max);
size = map_size + defer_size;
@@ -308,7 +292,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg)
info->map_nr_max = shrinker_nr_max;
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
}
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
return ret;
}
@@ -324,7 +308,7 @@ static int expand_shrinker_info(int new_id)
if (!root_mem_cgroup)
goto out;
- lockdep_assert_held(&shrinker_mutex);
+ lockdep_assert_held(&shrinker_rwsem);
map_size = shrinker_map_size(new_nr_max);
defer_size = shrinker_defer_size(new_nr_max);
@@ -352,16 +336,15 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct shrinker_info *info;
- int srcu_idx;
- srcu_idx = srcu_read_lock(&shrinker_srcu);
- info = shrinker_info_srcu(memcg, nid);
+ rcu_read_lock();
+ info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
set_bit(shrinker_id, info->map);
}
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ rcu_read_unlock();
}
}
@@ -374,7 +357,8 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
if (mem_cgroup_disabled())
return -ENOSYS;
- mutex_lock(&shrinker_mutex);
+ down_write(&shrinker_rwsem);
+ /* This may call shrinker, so it must use down_read_trylock() */
id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
if (id < 0)
goto unlock;
@@ -388,7 +372,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
shrinker->id = id;
ret = 0;
unlock:
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
return ret;
}
@@ -398,7 +382,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
BUG_ON(id < 0);
- lockdep_assert_held(&shrinker_mutex);
+ lockdep_assert_held(&shrinker_rwsem);
idr_remove(&shrinker_idr, id);
}
@@ -408,7 +392,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;
- info = shrinker_info_srcu(memcg, nid);
+ info = shrinker_info_protected(memcg, nid);
return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}
@@ -417,7 +401,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
{
struct shrinker_info *info;
- info = shrinker_info_srcu(memcg, nid);
+ info = shrinker_info_protected(memcg, nid);
return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}
@@ -433,7 +417,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
parent = root_mem_cgroup;
/* Prevent from concurrent shrinker_info expand */
- mutex_lock(&shrinker_mutex);
+ down_read(&shrinker_rwsem);
for_each_node(nid) {
child_info = shrinker_info_protected(memcg, nid);
parent_info = shrinker_info_protected(parent, nid);
@@ -442,15 +426,20 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg)
atomic_long_add(nr, &parent_info->nr_deferred[i]);
}
}
- mutex_unlock(&shrinker_mutex);
+ up_read(&shrinker_rwsem);
}
+/* Returns true for reclaim through cgroup limits or cgroup interfaces. */
static bool cgroup_reclaim(struct scan_control *sc)
{
return sc->target_mem_cgroup;
}
-static bool global_reclaim(struct scan_control *sc)
+/*
+ * Returns true for reclaim on the root cgroup. This is true for direct
+ * allocator reclaim and reclaim through cgroup interfaces on the root cgroup.
+ */
+static bool root_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
}
@@ -505,7 +494,7 @@ static bool cgroup_reclaim(struct scan_control *sc)
return false;
}
-static bool global_reclaim(struct scan_control *sc)
+static bool root_reclaim(struct scan_control *sc)
{
return true;
}
@@ -562,7 +551,7 @@ static void flush_reclaim_state(struct scan_control *sc)
* memcg reclaim, to make reporting more accurate and reduce
* underestimation, but it's probably not worth the complexity for now.
*/
- if (current->reclaim_state && global_reclaim(sc)) {
+ if (current->reclaim_state && root_reclaim(sc)) {
sc->nr_reclaimed += current->reclaim_state->reclaimed;
current->reclaim_state->reclaimed = 0;
}
@@ -743,9 +732,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
shrinker->name = NULL;
#endif
if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- mutex_lock(&shrinker_mutex);
+ down_write(&shrinker_rwsem);
unregister_memcg_shrinker(shrinker);
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
return;
}
@@ -755,11 +744,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
void register_shrinker_prepared(struct shrinker *shrinker)
{
- mutex_lock(&shrinker_mutex);
- list_add_tail_rcu(&shrinker->list, &shrinker_list);
+ down_write(&shrinker_rwsem);
+ list_add_tail(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED;
shrinker_debugfs_add(shrinker);
- mutex_unlock(&shrinker_mutex);
+ up_write(&shrinker_rwsem);
}
static int __register_shrinker(struct shrinker *shrinker)
@@ -810,16 +799,13 @@ void unregister_shrinker(struct shrinker *shrinker)
if (!(shrinker->flags & SHRINKER_REGISTERED))
return;
- mutex_lock(&shrinker_mutex);
- list_del_rcu(&shrinker->list);
+ down_write(&shrinker_rwsem);
+ list_del(&shrinker->list);
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
- mutex_unlock(&shrinker_mutex);
-
- atomic_inc(&shrinker_srcu_generation);
- synchronize_srcu(&shrinker_srcu);
+ up_write(&shrinker_rwsem);
shrinker_debugfs_remove(debugfs_entry, debugfs_id);
@@ -831,13 +817,15 @@ EXPORT_SYMBOL(unregister_shrinker);
/**
* synchronize_shrinkers - Wait for all running shrinkers to complete.
*
- * This is useful to guarantee that all shrinker invocations have seen an
- * update, before freeing memory.
+ * This is equivalent to calling unregister_shrinker() and register_shrinker(),
+ * but atomically and with less overhead. This is useful to guarantee that all
+ * shrinker invocations have seen an update, before freeing memory, similar to
+ * RCU.
*/
void synchronize_shrinkers(void)
{
- atomic_inc(&shrinker_srcu_generation);
- synchronize_srcu(&shrinker_srcu);
+ down_write(&shrinker_rwsem);
+ up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(synchronize_shrinkers);
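
With the SRCU scheme reverted, synchronize_shrinkers() is just a write lock/unlock pair on shrinker_rwsem: it returns only after every shrinker that was already running under the read lock has finished. A userspace sketch of the same barrier idiom using a pthread rwlock (not kernel code; compile with -pthread, timing values are illustrative only):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static void *reader(void *arg)
{
        (void)arg;
        pthread_rwlock_rdlock(&lock);
        puts("reader: running shrinker");
        sleep(1);                       /* pretend to shrink for a while */
        puts("reader: done");
        pthread_rwlock_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, reader, NULL);
        usleep(100 * 1000);             /* let the reader take the lock first */

        /* "synchronize": blocks until the in-flight reader drops the lock */
        pthread_rwlock_wrlock(&lock);
        pthread_rwlock_unlock(&lock);
        puts("synchronized: all prior readers have finished");

        pthread_join(t, NULL);
        return 0;
}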
@@ -946,20 +934,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
{
struct shrinker_info *info;
unsigned long ret, freed = 0;
- int srcu_idx, generation;
- int i = 0;
+ int i;
if (!mem_cgroup_online(memcg))
return 0;
-again:
- srcu_idx = srcu_read_lock(&shrinker_srcu);
- info = shrinker_info_srcu(memcg, nid);
+ if (!down_read_trylock(&shrinker_rwsem))
+ return 0;
+
+ info = shrinker_info_protected(memcg, nid);
if (unlikely(!info))
goto unlock;
- generation = atomic_read(&shrinker_srcu_generation);
- for_each_set_bit_from(i, info->map, info->map_nr_max) {
+ for_each_set_bit(i, info->map, info->map_nr_max) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -1005,14 +992,14 @@ again:
set_shrinker_bit(memcg, nid, i);
}
freed += ret;
- if (atomic_read(&shrinker_srcu_generation) != generation) {
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
- i++;
- goto again;
+
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
}
}
unlock:
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ up_read(&shrinker_rwsem);
return freed;
}
#else /* CONFIG_MEMCG */
@@ -1049,7 +1036,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
{
unsigned long ret, freed = 0;
struct shrinker *shrinker;
- int srcu_idx, generation;
/*
* The root memcg might be allocated even though memcg is disabled
@@ -1061,11 +1047,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
- srcu_idx = srcu_read_lock(&shrinker_srcu);
+ if (!down_read_trylock(&shrinker_rwsem))
+ goto out;
- generation = atomic_read(&shrinker_srcu_generation);
- list_for_each_entry_srcu(shrinker, &shrinker_list, list,
- srcu_read_lock_held(&shrinker_srcu)) {
+ list_for_each_entry(shrinker, &shrinker_list, list) {
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
@@ -1076,14 +1061,19 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (ret == SHRINK_EMPTY)
ret = 0;
freed += ret;
-
- if (atomic_read(&shrinker_srcu_generation) != generation) {
+ /*
+ * Bail out if someone wants to register a new shrinker to
+ * prevent the registration from being stalled for long periods
+ * by parallel ongoing shrinking.
+ */
+ if (rwsem_is_contended(&shrinker_rwsem)) {
freed = freed ? : 1;
break;
}
}
- srcu_read_unlock(&shrinker_srcu, srcu_idx);
+ up_read(&shrinker_rwsem);
+out:
cond_resched();
return freed;
}
@@ -1621,9 +1611,10 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}
-static struct page *alloc_demote_page(struct page *page, unsigned long private)
+static struct folio *alloc_demote_folio(struct folio *src,
+ unsigned long private)
{
- struct page *target_page;
+ struct folio *dst;
nodemask_t *allowed_mask;
struct migration_target_control *mtc;
@@ -1641,14 +1632,14 @@ static struct page *alloc_demote_page(struct page *page, unsigned long private)
*/
mtc->nmask = NULL;
mtc->gfp_mask |= __GFP_THISNODE;
- target_page = alloc_migration_target(page, (unsigned long)mtc);
- if (target_page)
- return target_page;
+ dst = alloc_migration_target(src, (unsigned long)mtc);
+ if (dst)
+ return dst;
mtc->gfp_mask &= ~__GFP_THISNODE;
mtc->nmask = allowed_mask;
- return alloc_migration_target(page, (unsigned long)mtc);
+ return alloc_migration_target(src, (unsigned long)mtc);
}
/*
@@ -1683,7 +1674,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
node_get_allowed_targets(pgdat, &allowed_mask);
/* Demotion ignores all cpuset and mempolicy settings */
- migrate_pages(demote_folios, alloc_demote_page, NULL,
+ migrate_pages(demote_folios, alloc_demote_folio, NULL,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
@@ -2270,6 +2261,25 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
}
+#ifdef CONFIG_CMA
+/*
+ * It is a waste of effort to scan and reclaim CMA pages if they are not
+ * available to the current allocation context. Kswapd cannot be enrolled
+ * in this optimization, as it cannot distinguish the scenario: it always
+ * uses sc->gfp_mask = GFP_KERNEL.
+ */
+static bool skip_cma(struct folio *folio, struct scan_control *sc)
+{
+ return !current_is_kswapd() &&
+ gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
+ get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
+}
+#else
+static bool skip_cma(struct folio *folio, struct scan_control *sc)
+{
+ return false;
+}
+#endif
+
/*
* Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
*
@@ -2316,7 +2326,8 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (folio_zonenum(folio) > sc->reclaim_idx) {
+ if (folio_zonenum(folio) > sc->reclaim_idx ||
+ skip_cma(folio, sc)) {
nr_skipped[folio_zonenum(folio)] += nr_pages;
move_to = &folios_skipped;
goto move;
@@ -2458,7 +2469,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
* won't get blocked by normal direct-reclaimers, forming a circular
* deadlock.
*/
- if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
+ if (gfp_has_io_fs(sc->gfp_mask))
inactive >>= 3;
too_many = isolated > inactive;
@@ -3233,6 +3244,16 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
#endif
+static bool should_walk_mmu(void)
+{
+ return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK);
+}
+
+static bool should_clear_pmd_young(void)
+{
+ return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG);
+}
+
/******************************************************************************
* shorthand helpers
******************************************************************************/
@@ -3993,28 +4014,29 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
- VM_WARN_ON_ONCE(pmd_leaf(*pmd));
-
- ptl = pte_lockptr(args->mm, pmd);
- if (!spin_trylock(ptl))
+ pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
+ if (!pte)
+ return false;
+ if (!spin_trylock(ptl)) {
+ pte_unmap(pte);
return false;
+ }
arch_enter_lazy_mmu_mode();
-
- pte = pte_offset_map(pmd, start & PMD_MASK);
restart:
for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
unsigned long pfn;
struct folio *folio;
+ pte_t ptent = ptep_get(pte + i);
total++;
walk->mm_stats[MM_LEAF_TOTAL]++;
- pfn = get_pte_pfn(pte[i], args->vma, addr);
+ pfn = get_pte_pfn(ptent, args->vma, addr);
if (pfn == -1)
continue;
- if (!pte_young(pte[i])) {
+ if (!pte_young(ptent)) {
walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -4029,7 +4051,7 @@ restart:
young++;
walk->mm_stats[MM_LEAF_YOUNG]++;
- if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
!(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
!folio_test_swapcache(folio)))
folio_mark_dirty(folio);
@@ -4042,10 +4064,8 @@ restart:
if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
goto restart;
- pte_unmap(pte);
-
arch_leave_lazy_mmu_mode();
- spin_unlock(ptl);
+ pte_unmap_unlock(pte, ptl);
return suitable_to_scan(total, young);
}
@@ -4097,7 +4117,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ if (should_clear_pmd_young())
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
@@ -4143,7 +4163,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
unsigned long next;
unsigned long addr;
struct vm_area_struct *vma;
- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+ DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
unsigned long first = -1;
struct lru_gen_mm_walk *walk = args->private;
@@ -4190,7 +4210,7 @@ restart:
#endif
walk->mm_stats[MM_NONLEAF_TOTAL]++;
- if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (should_clear_pmd_young()) {
if (!pmd_young(val))
continue;
@@ -4492,7 +4512,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
+ if (!should_walk_mmu()) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -4674,12 +4694,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
unsigned long pfn;
+ pte_t ptent = ptep_get(pte + i);
- pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+ pfn = get_pte_pfn(ptent, pvmw->vma, addr);
if (pfn == -1)
continue;
- if (!pte_young(pte[i]))
+ if (!pte_young(ptent))
continue;
folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
@@ -4691,7 +4712,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
young++;
- if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
!(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
!folio_test_swapcache(folio)))
folio_mark_dirty(folio);
@@ -4743,10 +4764,11 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{
int seg;
int old, new;
+ unsigned long flags;
int bin = get_random_u32_below(MEMCG_NR_BINS);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- spin_lock(&pgdat->memcg_lru.lock);
+ spin_lock_irqsave(&pgdat->memcg_lru.lock, flags);
VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
@@ -4781,7 +4803,7 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
- spin_unlock(&pgdat->memcg_lru.lock);
+ spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
}
void lru_gen_online_memcg(struct mem_cgroup *memcg)
@@ -4794,7 +4816,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg)
struct pglist_data *pgdat = NODE_DATA(nid);
struct lruvec *lruvec = get_lruvec(memcg, nid);
- spin_lock(&pgdat->memcg_lru.lock);
+ spin_lock_irq(&pgdat->memcg_lru.lock);
VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
@@ -4805,7 +4827,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg)
lruvec->lrugen.gen = gen;
- spin_unlock(&pgdat->memcg_lru.lock);
+ spin_unlock_irq(&pgdat->memcg_lru.lock);
}
}
@@ -4829,7 +4851,7 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg)
struct pglist_data *pgdat = NODE_DATA(nid);
struct lruvec *lruvec = get_lruvec(memcg, nid);
- spin_lock(&pgdat->memcg_lru.lock);
+ spin_lock_irq(&pgdat->memcg_lru.lock);
VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
@@ -4841,12 +4863,14 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg)
if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
- spin_unlock(&pgdat->memcg_lru.lock);
+ spin_unlock_irq(&pgdat->memcg_lru.lock);
}
}
-void lru_gen_soft_reclaim(struct lruvec *lruvec)
+void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
/* see the comment on MEMCG_NR_GENS */
if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
@@ -4912,7 +4936,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
lrugen->protected[hist][type][tier - 1] + delta);
- __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
return true;
}
@@ -5307,7 +5330,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
static unsigned long get_nr_to_reclaim(struct scan_control *sc)
{
/* don't abort memcg reclaim to ensure fairness */
- if (!global_reclaim(sc))
+ if (!root_reclaim(sc))
return -1;
return max(sc->nr_to_reclaim, compact_gap(sc->order));
@@ -5459,7 +5482,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
{
struct blk_plug plug;
- VM_WARN_ON_ONCE(global_reclaim(sc));
+ VM_WARN_ON_ONCE(root_reclaim(sc));
VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
lru_add_drain();
@@ -5520,7 +5543,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
struct blk_plug plug;
unsigned long reclaimed = sc->nr_reclaimed;
- VM_WARN_ON_ONCE(!global_reclaim(sc));
+ VM_WARN_ON_ONCE(!root_reclaim(sc));
/*
* Unmapped clean folios are already prioritized. Scanning for more of
@@ -5727,10 +5750,10 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c
if (get_cap(LRU_GEN_CORE))
caps |= BIT(LRU_GEN_CORE);
- if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+ if (should_walk_mmu())
caps |= BIT(LRU_GEN_MM_WALK);
- if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ if (should_clear_pmd_young())
caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
return sysfs_emit(buf, "0x%04x\n", caps);
@@ -6242,7 +6265,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
bool proportional_reclaim;
struct blk_plug plug;
- if (lru_gen_enabled() && !global_reclaim(sc)) {
+ if (lru_gen_enabled() && !root_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -6398,14 +6421,13 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
if (!managed_zone(zone))
continue;
- switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
- case COMPACT_SUCCESS:
- case COMPACT_CONTINUE:
+ /* Allocation can already succeed, nothing to do */
+ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ sc->reclaim_idx, 0))
+ return false;
+
+ if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
return false;
- default:
- /* check next zone */
- ;
- }
}
/*
@@ -6484,7 +6506,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
struct lruvec *target_lruvec;
bool reclaimable = false;
- if (lru_gen_enabled() && global_reclaim(sc)) {
+ if (lru_gen_enabled() && root_reclaim(sc)) {
lru_gen_shrink_node(pgdat, sc);
return;
}
@@ -6556,10 +6578,13 @@ again:
* Legacy memcg will stall in page writeback so avoid forcibly
* stalling in reclaim_throttle().
*/
- if ((current_is_kswapd() ||
- (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
- sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
- set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
+ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) {
+ if (cgroup_reclaim(sc) && writeback_throttling_sane(sc))
+ set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags);
+
+ if (current_is_kswapd())
+ set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags);
+ }
/*
* Stall direct reclaim for IO completions if the lruvec is
@@ -6569,7 +6594,8 @@ again:
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
- test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
+ (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) ||
+ test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)))
reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc))
@@ -6593,14 +6619,14 @@ again:
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
unsigned long watermark;
- enum compact_result suitable;
- suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
- if (suitable == COMPACT_SUCCESS)
- /* Allocation should succeed already. Don't reclaim. */
+ /* Allocation can already succeed, nothing to do */
+ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
+ sc->reclaim_idx, 0))
return true;
- if (suitable == COMPACT_SKIPPED)
- /* Compaction cannot yet proceed. Do reclaim. */
+
+ /* Compaction cannot yet proceed. Do reclaim. */
+ if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
return false;
/*
@@ -6826,7 +6852,7 @@ retry:
lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
zone->zone_pgdat);
- clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
+ clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
}
}
@@ -6887,7 +6913,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
- free_pages += zone_page_state(zone, NR_FREE_PAGES);
+ free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES);
}
/* If there are no reserves (unexpected config) then do not throttle */
@@ -7215,7 +7241,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
{
struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
- clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
+ clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags);
+ clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
clear_bit(PGDAT_DIRTY, &pgdat->flags);
clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
@@ -7840,7 +7867,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
/*
* This kswapd start function will be called by init and node-hot-add.
*/
-void kswapd_run(int nid)
+void __meminit kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
@@ -7861,7 +7888,7 @@ void kswapd_run(int nid)
* Called by memory hotplug when all memory in a node is offlined. Caller must
* be holding mem_hotplug_begin/done().
*/
-void kswapd_stop(int nid)
+void __meminit kswapd_stop(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
struct task_struct *kswapd;
@@ -8058,23 +8085,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
}
#endif
-void check_move_unevictable_pages(struct pagevec *pvec)
-{
- struct folio_batch fbatch;
- unsigned i;
-
- folio_batch_init(&fbatch);
- for (i = 0; i < pvec->nr; i++) {
- struct page *page = pvec->pages[i];
-
- if (PageTransTail(page))
- continue;
- folio_batch_add(&fbatch, page_folio(page));
- }
- check_move_unevictable_folios(&fbatch);
-}
-EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
-
/**
* check_move_unevictable_folios - Move evictable folios to appropriate zone
* lru list
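
The shrink_slab()/shrink_slab_memcg() hunks above restore the pattern of taking shrinker_rwsem with down_read_trylock() and bailing out of the walk as soon as rwsem_is_contended() reports a waiting writer, returning at least one freed object so callers do not mistake the early exit for "nothing reclaimable". A minimal userspace model of that control flow (not kernel code; the rwsem is mocked with plain flags rather than a real lock):

#include <stdio.h>
#include <stdbool.h>

struct rwsem_model {
        bool write_locked;
        bool writer_waiting;
};

static bool down_read_trylock_model(struct rwsem_model *sem)
{
        return !sem->write_locked;
}

static unsigned long shrink_all(struct rwsem_model *sem,
                                const unsigned long *counts, int n)
{
        unsigned long freed = 0;

        if (!down_read_trylock_model(sem))
                return 0;                       /* writer holds it: skip this pass */

        for (int i = 0; i < n; i++) {
                freed += counts[i];
                if (sem->writer_waiting) {      /* mirrors rwsem_is_contended() */
                        freed = freed ? freed : 1;
                        break;
                }
        }
        return freed;                           /* up_read() in the real code */
}

int main(void)
{
        unsigned long counts[] = { 10, 20, 30 };
        struct rwsem_model calm = { false, false };
        struct rwsem_model contended = { false, true };

        printf("calm: %lu\n", shrink_all(&calm, counts, 3));           /* 60 */
        printf("contended: %lu\n", shrink_all(&contended, counts, 3)); /* 10 */
        return 0;
}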
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c28046371b45..b731d57996c5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/sched/isolation.h>
#include "internal.h"
@@ -1180,6 +1181,9 @@ const char * const vmstat_text[] = {
"nr_zspages",
#endif
"nr_free_cma",
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ "nr_unaccepted",
+#endif
/* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
@@ -2022,6 +2026,20 @@ static void vmstat_shepherd(struct work_struct *w)
for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+ /*
+ * In-kernel users of vmstat counters either require the precise value,
+ * in which case they use the zone_page_state_snapshot interface, or they
+ * can live with an imprecision, as the regular flushing can happen at an
+ * arbitrary time and the cumulative error can grow (see
+ * calculate_normal_threshold).
+ *
+ * From that POV, regular flushing can be postponed for CPUs that have
+ * been isolated from kernel interference, without critical infrastructure
+ * ever noticing. Skip regular flushing from vmstat_shepherd for all
+ * isolated CPUs to avoid interference with the isolated workload.
+ */
+ if (cpu_is_isolated(cpu))
+ continue;
+
if (!delayed_work_pending(dw) && need_update(cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
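
The vmstat_shepherd() change simply skips queueing the per-CPU flush work for isolated CPUs. A tiny userspace model of that loop (not kernel code; a hard-coded mask stands in for cpu_is_isolated()):

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

static bool cpu_isolated[NR_CPUS] = { false, false, true, false };
static bool needs_flush[NR_CPUS]  = { true,  true,  true, true  };

static void shepherd_pass(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (cpu_isolated[cpu])
                        continue;       /* leave the isolated CPU alone */
                if (needs_flush[cpu]) {
                        printf("queue flush work on cpu %d\n", cpu);
                        needs_flush[cpu] = false;
                }
        }
}

int main(void)
{
        shepherd_pass();                /* cpu 2 is never touched */
        return 0;
}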
diff --git a/mm/workingset.c b/mm/workingset.c
index 817758951886..4686ae363000 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -255,45 +255,58 @@ static void *lru_gen_eviction(struct folio *folio)
return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
}
+/*
+ * Tests if the shadow entry is for a folio that was recently evicted.
+ * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
+ */
+static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+ unsigned long *token, bool *workingset)
+{
+ int memcg_id;
+ unsigned long min_seq;
+ struct mem_cgroup *memcg;
+ struct pglist_data *pgdat;
+
+ unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
+
+ memcg = mem_cgroup_from_id(memcg_id);
+ *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
+ return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
+}
+
static void lru_gen_refault(struct folio *folio, void *shadow)
{
+ bool recent;
int hist, tier, refs;
- int memcg_id;
bool workingset;
unsigned long token;
- unsigned long min_seq;
struct lruvec *lruvec;
struct lru_gen_folio *lrugen;
- struct mem_cgroup *memcg;
- struct pglist_data *pgdat;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
- unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
-
- if (pgdat != folio_pgdat(folio))
- return;
-
rcu_read_lock();
- memcg = folio_memcg_rcu(folio);
- if (memcg_id != mem_cgroup_id(memcg))
+ recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
+ if (lruvec != folio_lruvec(folio))
goto unlock;
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- lrugen = &lruvec->lrugen;
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
- min_seq = READ_ONCE(lrugen->min_seq[type]);
- if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
+ if (!recent)
goto unlock;
- hist = lru_hist_from_seq(min_seq);
+ lrugen = &lruvec->lrugen;
+
+ hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
/* see the comment in folio_lru_refs() */
refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
tier = lru_tier_from_refs(refs);
atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
- mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
/*
* Count the following two cases as stalls:
@@ -317,6 +330,12 @@ static void *lru_gen_eviction(struct folio *folio)
return NULL;
}
+static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
+ unsigned long *token, bool *workingset)
+{
+ return false;
+}
+
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}
@@ -385,42 +404,33 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
}
/**
- * workingset_refault - Evaluate the refault of a previously evicted folio.
- * @folio: The freshly allocated replacement folio.
- * @shadow: Shadow entry of the evicted folio.
- *
- * Calculates and evaluates the refault distance of the previously
- * evicted folio in the context of the node and the memcg whose memory
- * pressure caused the eviction.
+ * workingset_test_recent - tests if the shadow entry is for a folio that was
+ * recently evicted. Also fills in @workingset with the value unpacked from
+ * shadow.
+ * @shadow: the shadow entry to be tested.
+ * @file: whether the corresponding folio is from the file lru.
+ * @workingset: where the workingset value unpacked from shadow should
+ * be stored.
+ *
+ * Return: true if the shadow is for a recently evicted folio; false otherwise.
*/
-void workingset_refault(struct folio *folio, void *shadow)
+bool workingset_test_recent(void *shadow, bool file, bool *workingset)
{
- bool file = folio_is_file_lru(folio);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
unsigned long workingset_size;
- struct pglist_data *pgdat;
- struct mem_cgroup *memcg;
- unsigned long eviction;
- struct lruvec *lruvec;
unsigned long refault;
- bool workingset;
int memcgid;
- long nr;
+ struct pglist_data *pgdat;
+ unsigned long eviction;
- if (lru_gen_enabled()) {
- lru_gen_refault(folio, shadow);
- return;
- }
+ if (lru_gen_enabled())
+ return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset);
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
eviction <<= bucket_order;
- /* Flush stats (and potentially sleep) before holding RCU read lock */
- mem_cgroup_flush_stats_ratelimited();
-
- rcu_read_lock();
/*
* Look up the memcg associated with the stored ID. It might
* have been deleted since the folio's eviction.
@@ -439,7 +449,8 @@ void workingset_refault(struct folio *folio, void *shadow)
*/
eviction_memcg = mem_cgroup_from_id(memcgid);
if (!mem_cgroup_disabled() && !eviction_memcg)
- goto out;
+ return false;
+
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -462,20 +473,6 @@ void workingset_refault(struct folio *folio, void *shadow)
refault_distance = (refault - eviction) & EVICTION_MASK;
/*
- * The activation decision for this folio is made at the level
- * where the eviction occurred, as that is where the LRU order
- * during folio reclaim is being determined.
- *
- * However, the cgroup that will own the folio is the one that
- * is actually experiencing the refault event.
- */
- nr = folio_nr_pages(folio);
- memcg = folio_memcg(folio);
- pgdat = folio_pgdat(folio);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
-
- mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
- /*
* Compare the distance to the existing workingset size. We
* don't activate pages that couldn't stay resident even if
* all the memory was available to the workingset. Whether
@@ -495,7 +492,54 @@ void workingset_refault(struct folio *folio, void *shadow)
NR_INACTIVE_ANON);
}
}
- if (refault_distance > workingset_size)
+
+ return refault_distance <= workingset_size;
+}
+
+/**
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted folio in the context of the node and the memcg whose memory
+ * pressure caused the eviction.
+ */
+void workingset_refault(struct folio *folio, void *shadow)
+{
+ bool file = folio_is_file_lru(folio);
+ struct pglist_data *pgdat;
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ bool workingset;
+ long nr;
+
+ if (lru_gen_enabled()) {
+ lru_gen_refault(folio, shadow);
+ return;
+ }
+
+ /* Flush stats (and potentially sleep) before holding RCU read lock */
+ mem_cgroup_flush_stats_ratelimited();
+
+ rcu_read_lock();
+
+ /*
+ * The activation decision for this folio is made at the level
+ * where the eviction occurred, as that is where the LRU order
+ * during folio reclaim is being determined.
+ *
+ * However, the cgroup that will own the folio is the one that
+ * is actually experiencing the refault event.
+ */
+ nr = folio_nr_pages(folio);
+ memcg = folio_memcg(folio);
+ pgdat = folio_pgdat(folio);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
+
+ if (!workingset_test_recent(shadow, file, &workingset))
goto out;
folio_set_active(folio);
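
lru_gen_test_recent(), factored out above, decides recency by comparing the generation bits stored in the shadow token against the lruvec's current min_seq. A userspace sketch of that bit layout and comparison (not kernel code; the widths below are illustrative, not the kernel's actual LRU_REFS_WIDTH or EVICTION_MASK values):

#include <stdio.h>
#include <stdbool.h>

#define LRU_REFS_WIDTH  2
#define EVICTION_BITS   16
#define EVICTION_MASK   ((1UL << EVICTION_BITS) - 1)

/* pack the eviction generation above the reference-count bits */
static unsigned long pack_token(unsigned long min_seq, unsigned long refs)
{
        return ((min_seq << LRU_REFS_WIDTH) | refs) & EVICTION_MASK;
}

/* recent iff the stored generation still matches the current min_seq */
static bool test_recent(unsigned long token, unsigned long min_seq)
{
        return (token >> LRU_REFS_WIDTH) ==
               (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
}

int main(void)
{
        unsigned long token = pack_token(5, 3); /* evicted at min_seq == 5 */

        printf("%d\n", test_recent(token, 5));  /* 1: generation unchanged */
        printf("%d\n", test_recent(token, 6));  /* 0: the LRU has aged past it */
        return 0;
}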
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 0cef845d397b..e84de91ecccb 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -125,13 +125,11 @@ struct z3fold_header {
/**
* struct z3fold_pool - stores metadata for each z3fold pool
* @name: pool name
- * @lock: protects pool unbuddied/lru lists
+ * @lock: protects pool unbuddied lists
* @stale_lock: protects pool stale page list
* @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
* buddies; the list each z3fold page is added to depends on
* the size of its free region.
- * @lru: list tracking the z3fold pages in LRU order by most recently
- * added buddy.
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
* @c_handle: cache for z3fold_buddy_slots allocation
@@ -149,12 +147,9 @@ struct z3fold_pool {
spinlock_t lock;
spinlock_t stale_lock;
struct list_head *unbuddied;
- struct list_head lru;
struct list_head stale;
atomic64_t pages_nr;
struct kmem_cache *c_handle;
- struct zpool *zpool;
- const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
struct work_struct work;
@@ -329,7 +324,6 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
struct z3fold_header *zhdr = page_address(page);
struct z3fold_buddy_slots *slots;
- INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
@@ -451,8 +445,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
set_bit(PAGE_STALE, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
spin_lock(&pool->lock);
- if (!list_empty(&page->lru))
- list_del_init(&page->lru);
spin_unlock(&pool->lock);
if (locked)
@@ -930,7 +922,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp)
for_each_unbuddied_list(i, 0)
INIT_LIST_HEAD(&unbuddied[i]);
}
- INIT_LIST_HEAD(&pool->lru);
INIT_LIST_HEAD(&pool->stale);
atomic64_set(&pool->pages_nr, 0);
pool->name = name;
@@ -1073,12 +1064,6 @@ found:
headless:
spin_lock(&pool->lock);
- /* Add/move z3fold page to beginning of LRU */
- if (!list_empty(&page->lru))
- list_del(&page->lru);
-
- list_add(&page->lru, &pool->lru);
-
*handle = encode_handle(zhdr, bud);
spin_unlock(&pool->lock);
if (bud != HEADLESS)
@@ -1115,9 +1100,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
* immediately so we don't care about its value any more.
*/
if (!page_claimed) {
- spin_lock(&pool->lock);
- list_del(&page->lru);
- spin_unlock(&pool->lock);
put_z3fold_header(zhdr);
free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
@@ -1173,194 +1155,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
}
/**
- * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
- * @pool: pool from which a page will attempt to be evicted
- * @retries: number of pages on the LRU list for which eviction will
- * be attempted before failing
- *
- * z3fold reclaim is different from normal system reclaim in that it is done
- * from the bottom, up. This is because only the bottom layer, z3fold, has
- * information on how the allocations are organized within each z3fold page.
- * This has the potential to create interesting locking situations between
- * z3fold and the user, however.
- *
- * To avoid these, this is how z3fold_reclaim_page() should be called:
- *
- * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
- * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
- * call the user-defined eviction handler with the pool and handle as
- * arguments.
- *
- * If the handle can not be evicted, the eviction handler should return
- * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
- * appropriate list and try the next z3fold page on the LRU up to
- * a user defined number of retries.
- *
- * If the handle is successfully evicted, the eviction handler should
- * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
- * contains logic to delay freeing the page if the page is under reclaim,
- * as indicated by the setting of the PG_reclaim flag on the underlying page.
- *
- * If all buddies in the z3fold page are successfully evicted, then the
- * z3fold page can be freed.
- *
- * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
- * no pages to evict or an eviction handler is not registered, -EAGAIN if
- * the retry limit was hit.
- */
-static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
-{
- int i, ret = -1;
- struct z3fold_header *zhdr = NULL;
- struct page *page = NULL;
- struct list_head *pos;
- unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
- struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
-
- rwlock_init(&slots.lock);
- slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
-
- spin_lock(&pool->lock);
- for (i = 0; i < retries; i++) {
- if (list_empty(&pool->lru)) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
- list_for_each_prev(pos, &pool->lru) {
- page = list_entry(pos, struct page, lru);
-
- zhdr = page_address(page);
- if (test_bit(PAGE_HEADLESS, &page->private)) {
- /*
- * For non-headless pages, we wait to do this
- * until we have the page lock to avoid racing
- * with __z3fold_alloc(). Headless pages don't
- * have a lock (and __z3fold_alloc() will never
- * see them), but we still need to test and set
- * PAGE_CLAIMED to avoid racing with
- * z3fold_free(), so just do it now before
- * leaving the loop.
- */
- if (test_and_set_bit(PAGE_CLAIMED, &page->private))
- continue;
-
- break;
- }
-
- if (!z3fold_page_trylock(zhdr)) {
- zhdr = NULL;
- continue; /* can't evict at this point */
- }
-
- /* test_and_set_bit is of course atomic, but we still
- * need to do it under page lock, otherwise checking
- * that bit in __z3fold_alloc wouldn't make sense
- */
- if (zhdr->foreign_handles ||
- test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- continue; /* can't evict such page */
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- /* See comment in __z3fold_alloc. */
- kref_get(&zhdr->refcount);
- break;
- }
-
- if (!zhdr)
- break;
-
- list_del_init(&page->lru);
- spin_unlock(&pool->lock);
-
- if (!test_bit(PAGE_HEADLESS, &page->private)) {
- /*
- * We need encode the handles before unlocking, and
- * use our local slots structure because z3fold_free
- * can zero out zhdr->slots and we can't do much
- * about that
- */
- first_handle = 0;
- last_handle = 0;
- middle_handle = 0;
- memset(slots.slot, 0, sizeof(slots.slot));
- if (zhdr->first_chunks)
- first_handle = __encode_handle(zhdr, &slots,
- FIRST);
- if (zhdr->middle_chunks)
- middle_handle = __encode_handle(zhdr, &slots,
- MIDDLE);
- if (zhdr->last_chunks)
- last_handle = __encode_handle(zhdr, &slots,
- LAST);
- /*
- * it's safe to unlock here because we hold a
- * reference to this page
- */
- z3fold_page_unlock(zhdr);
- } else {
- first_handle = encode_handle(zhdr, HEADLESS);
- last_handle = middle_handle = 0;
- }
- /* Issue the eviction callback(s) */
- if (middle_handle) {
- ret = pool->zpool_ops->evict(pool->zpool, middle_handle);
- if (ret)
- goto next;
- }
- if (first_handle) {
- ret = pool->zpool_ops->evict(pool->zpool, first_handle);
- if (ret)
- goto next;
- }
- if (last_handle) {
- ret = pool->zpool_ops->evict(pool->zpool, last_handle);
- if (ret)
- goto next;
- }
-next:
- if (test_bit(PAGE_HEADLESS, &page->private)) {
- if (ret == 0) {
- free_z3fold_page(page, true);
- atomic64_dec(&pool->pages_nr);
- return 0;
- }
- spin_lock(&pool->lock);
- list_add(&page->lru, &pool->lru);
- spin_unlock(&pool->lock);
- clear_bit(PAGE_CLAIMED, &page->private);
- } else {
- struct z3fold_buddy_slots *slots = zhdr->slots;
- z3fold_page_lock(zhdr);
- if (kref_put(&zhdr->refcount,
- release_z3fold_page_locked)) {
- kmem_cache_free(pool->c_handle, slots);
- return 0;
- }
- /*
- * if we are here, the page is still not completely
- * free. Take the global pool lock then to be able
- * to add it back to the lru list
- */
- spin_lock(&pool->lock);
- list_add(&page->lru, &pool->lru);
- spin_unlock(&pool->lock);
- if (list_empty(&zhdr->buddy))
- add_to_unbuddied(pool, zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
- }
-
- /* We started off locked to we need to lock the pool back */
- spin_lock(&pool->lock);
- }
- spin_unlock(&pool->lock);
- return -EAGAIN;
-}
-
-/**
* z3fold_map() - maps the allocation associated with the given handle
* @pool: pool in which the allocation resides
* @handle: handle associated with the allocation to be mapped
@@ -1470,8 +1264,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
spin_lock(&pool->lock);
if (!list_empty(&zhdr->buddy))
list_del_init(&zhdr->buddy);
- if (!list_empty(&page->lru))
- list_del_init(&page->lru);
spin_unlock(&pool->lock);
kref_get(&zhdr->refcount);
@@ -1531,9 +1323,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page,
encode_handle(new_zhdr, MIDDLE);
set_bit(NEEDS_COMPACTING, &newpage->private);
new_zhdr->cpu = smp_processor_id();
- spin_lock(&pool->lock);
- list_add(&newpage->lru, &pool->lru);
- spin_unlock(&pool->lock);
__SetPageMovable(newpage, &z3fold_mops);
z3fold_page_unlock(new_zhdr);
@@ -1559,9 +1348,6 @@ static void z3fold_page_putback(struct page *page)
INIT_LIST_HEAD(&page->lru);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
return;
- spin_lock(&pool->lock);
- list_add(&page->lru, &pool->lru);
- spin_unlock(&pool->lock);
if (list_empty(&zhdr->buddy))
add_to_unbuddied(pool, zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
@@ -1578,18 +1364,9 @@ static const struct movable_operations z3fold_mops = {
* zpool
****************/
-static void *z3fold_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
+static void *z3fold_zpool_create(const char *name, gfp_t gfp)
{
- struct z3fold_pool *pool;
-
- pool = z3fold_create_pool(name, gfp);
- if (pool) {
- pool->zpool = zpool;
- pool->zpool_ops = zpool_ops;
- }
- return pool;
+ return z3fold_create_pool(name, gfp);
}
static void z3fold_zpool_destroy(void *pool)
@@ -1607,25 +1384,6 @@ static void z3fold_zpool_free(void *pool, unsigned long handle)
z3fold_free(pool, handle);
}
-static int z3fold_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- unsigned int total = 0;
- int ret = -EINVAL;
-
- while (total < pages) {
- ret = z3fold_reclaim_page(pool, 8);
- if (ret < 0)
- break;
- total++;
- }
-
- if (reclaimed)
- *reclaimed = total;
-
- return ret;
-}
-
static void *z3fold_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -1649,7 +1407,6 @@ static struct zpool_driver z3fold_zpool_driver = {
.destroy = z3fold_zpool_destroy,
.malloc = z3fold_zpool_malloc,
.free = z3fold_zpool_free,
- .shrink = z3fold_zpool_shrink,
.map = z3fold_zpool_map,
.unmap = z3fold_zpool_unmap,
.total_size = z3fold_zpool_total_size,
diff --git a/mm/zbud.c b/mm/zbud.c
index 3acd26193920..2190cc1f37b3 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -83,11 +83,7 @@ struct zbud_pool;
* its free region.
* @buddied: list tracking the zbud pages that contain two buddies;
* these zbud pages are full
- * @lru: list tracking the zbud pages in LRU order by most recently
- * added buddy.
* @pages_nr: number of zbud pages in the pool.
- * @zpool: zpool driver
- * @zpool_ops: zpool operations structure with an evict callback
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular zbud pool.
@@ -102,26 +98,20 @@ struct zbud_pool {
struct list_head buddied;
struct list_head unbuddied[NCHUNKS];
};
- struct list_head lru;
u64 pages_nr;
- struct zpool *zpool;
- const struct zpool_ops *zpool_ops;
};
/*
* struct zbud_header - zbud page metadata occupying the first chunk of each
* zbud page.
* @buddy: links the zbud page into the unbuddied/buddied lists in the pool
- * @lru: links the zbud page into the lru list in the pool
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @last_chunks: the size of the last buddy in chunks, 0 if free
*/
struct zbud_header {
struct list_head buddy;
- struct list_head lru;
unsigned int first_chunks;
unsigned int last_chunks;
- bool under_reclaim;
};
/*****************
@@ -149,8 +139,6 @@ static struct zbud_header *init_zbud_page(struct page *page)
zhdr->first_chunks = 0;
zhdr->last_chunks = 0;
INIT_LIST_HEAD(&zhdr->buddy);
- INIT_LIST_HEAD(&zhdr->lru);
- zhdr->under_reclaim = false;
return zhdr;
}
@@ -221,7 +209,6 @@ static struct zbud_pool *zbud_create_pool(gfp_t gfp)
for_each_unbuddied_list(i, 0)
INIT_LIST_HEAD(&pool->unbuddied[i]);
INIT_LIST_HEAD(&pool->buddied);
- INIT_LIST_HEAD(&pool->lru);
pool->pages_nr = 0;
return pool;
}
@@ -310,11 +297,6 @@ found:
list_add(&zhdr->buddy, &pool->buddied);
}
- /* Add/move zbud page to beginning of LRU */
- if (!list_empty(&zhdr->lru))
- list_del(&zhdr->lru);
- list_add(&zhdr->lru, &pool->lru);
-
*handle = encode_handle(zhdr, bud);
spin_unlock(&pool->lock);
@@ -325,11 +307,6 @@ found:
* zbud_free() - frees the allocation associated with the given handle
* @pool: pool in which the allocation resided
* @handle: handle associated with the allocation returned by zbud_alloc()
- *
- * In the case that the zbud page in which the allocation resides is under
- * reclaim, as indicated by the PG_reclaim flag being set, this function
- * only sets the first|last_chunks to 0. The page is actually freed
- * once both buddies are evicted (see zbud_reclaim_page() below).
*/
static void zbud_free(struct zbud_pool *pool, unsigned long handle)
{
@@ -345,18 +322,11 @@ static void zbud_free(struct zbud_pool *pool, unsigned long handle)
else
zhdr->first_chunks = 0;
- if (zhdr->under_reclaim) {
- /* zbud page is under reclaim, reclaim will free */
- spin_unlock(&pool->lock);
- return;
- }
-
/* Remove from existing buddy list */
list_del(&zhdr->buddy);
if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
/* zbud page is empty, free */
- list_del(&zhdr->lru);
free_zbud_page(zhdr);
pool->pages_nr--;
} else {
@@ -369,110 +339,6 @@ static void zbud_free(struct zbud_pool *pool, unsigned long handle)
}
/**
- * zbud_reclaim_page() - evicts allocations from a pool page and frees it
- * @pool: pool from which a page will attempt to be evicted
- * @retries: number of pages on the LRU list for which eviction will
- * be attempted before failing
- *
- * zbud reclaim is different from normal system reclaim in that the reclaim is
- * done from the bottom, up. This is because only the bottom layer, zbud, has
- * information on how the allocations are organized within each zbud page. This
- * has the potential to create interesting locking situations between zbud and
- * the user, however.
- *
- * To avoid these, this is how zbud_reclaim_page() should be called:
- *
- * The user detects a page should be reclaimed and calls zbud_reclaim_page().
- * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
- * the user-defined eviction handler with the pool and handle as arguments.
- *
- * If the handle can not be evicted, the eviction handler should return
- * non-zero. zbud_reclaim_page() will add the zbud page back to the
- * appropriate list and try the next zbud page on the LRU up to
- * a user defined number of retries.
- *
- * If the handle is successfully evicted, the eviction handler should
- * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
- * contains logic to delay freeing the page if the page is under reclaim,
- * as indicated by the setting of the PG_reclaim flag on the underlying page.
- *
- * If all buddies in the zbud page are successfully evicted, then the
- * zbud page can be freed.
- *
- * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
- * no pages to evict or an eviction handler is not registered, -EAGAIN if
- * the retry limit was hit.
- */
-static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
-{
- int i, ret, freechunks;
- struct zbud_header *zhdr;
- unsigned long first_handle = 0, last_handle = 0;
-
- spin_lock(&pool->lock);
- if (list_empty(&pool->lru)) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
- for (i = 0; i < retries; i++) {
- zhdr = list_last_entry(&pool->lru, struct zbud_header, lru);
- list_del(&zhdr->lru);
- list_del(&zhdr->buddy);
- /* Protect zbud page against free */
- zhdr->under_reclaim = true;
- /*
- * We need encode the handles before unlocking, since we can
- * race with free that will set (first|last)_chunks to 0
- */
- first_handle = 0;
- last_handle = 0;
- if (zhdr->first_chunks)
- first_handle = encode_handle(zhdr, FIRST);
- if (zhdr->last_chunks)
- last_handle = encode_handle(zhdr, LAST);
- spin_unlock(&pool->lock);
-
- /* Issue the eviction callback(s) */
- if (first_handle) {
- ret = pool->zpool_ops->evict(pool->zpool, first_handle);
- if (ret)
- goto next;
- }
- if (last_handle) {
- ret = pool->zpool_ops->evict(pool->zpool, last_handle);
- if (ret)
- goto next;
- }
-next:
- spin_lock(&pool->lock);
- zhdr->under_reclaim = false;
- if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
- /*
- * Both buddies are now free, free the zbud page and
- * return success.
- */
- free_zbud_page(zhdr);
- pool->pages_nr--;
- spin_unlock(&pool->lock);
- return 0;
- } else if (zhdr->first_chunks == 0 ||
- zhdr->last_chunks == 0) {
- /* add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- } else {
- /* add to buddied list */
- list_add(&zhdr->buddy, &pool->buddied);
- }
-
- /* add to beginning of LRU */
- list_add(&zhdr->lru, &pool->lru);
- }
- spin_unlock(&pool->lock);
- return -EAGAIN;
-}
-
-/**
* zbud_map() - maps the allocation associated with the given handle
* @pool: pool in which the allocation resides
* @handle: handle associated with the allocation to be mapped
@@ -514,18 +380,9 @@ static u64 zbud_get_pool_size(struct zbud_pool *pool)
* zpool
****************/
-static void *zbud_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
+static void *zbud_zpool_create(const char *name, gfp_t gfp)
{
- struct zbud_pool *pool;
-
- pool = zbud_create_pool(gfp);
- if (pool) {
- pool->zpool = zpool;
- pool->zpool_ops = zpool_ops;
- }
- return pool;
+ return zbud_create_pool(gfp);
}
static void zbud_zpool_destroy(void *pool)
@@ -543,25 +400,6 @@ static void zbud_zpool_free(void *pool, unsigned long handle)
zbud_free(pool, handle);
}
-static int zbud_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- unsigned int total = 0;
- int ret = -EINVAL;
-
- while (total < pages) {
- ret = zbud_reclaim_page(pool, 8);
- if (ret < 0)
- break;
- total++;
- }
-
- if (reclaimed)
- *reclaimed = total;
-
- return ret;
-}
-
static void *zbud_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -585,7 +423,6 @@ static struct zpool_driver zbud_zpool_driver = {
.destroy = zbud_zpool_destroy,
.malloc = zbud_zpool_malloc,
.free = zbud_zpool_free,
- .shrink = zbud_zpool_shrink,
.map = zbud_zpool_map,
.unmap = zbud_zpool_unmap,
.total_size = zbud_zpool_total_size,
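
The kerneldoc removed above spelled out the contract between zbud and its user during reclaim: zbud_reclaim_page() pulls the oldest page off the pool LRU, invokes the user's evict callback for each live buddy, retries up to a caller-supplied number of pages, and frees the page only once both buddies are gone. Below is a minimal userspace sketch of that contract, not kernel code; the names (struct model_page, reclaim_page, evict_ok) are invented, and locking, the buddied/unbuddied lists and the under_reclaim protection are deliberately left out.

#include <stdbool.h>
#include <stdio.h>

/* One zbud-style page: two buddies and a "freed" flag; index nr-1 is the LRU tail. */
struct model_page {
	int first_chunks;	/* 0 means the first buddy is free */
	int last_chunks;	/* 0 means the last buddy is free */
	bool freed;
};

/* Evict callback: 0 on success, non-zero if the buddy cannot be written back yet. */
typedef int (*evict_cb)(struct model_page *p, bool first_buddy);

/*
 * Try to empty and free one page, walking from the LRU tail and giving up
 * after 'retries' pages, mirroring the contract the removed
 * zbud_reclaim_page() documented.
 */
static int reclaim_page(struct model_page *lru, int nr, evict_cb evict, int retries)
{
	for (int i = nr - 1; i >= 0 && retries-- > 0; i--) {
		struct model_page *p = &lru[i];

		if (p->freed)
			continue;
		if (p->first_chunks && evict(p, true))
			continue;		/* handler said no: try the next page */
		p->first_chunks = 0;
		if (p->last_chunks && evict(p, false))
			continue;
		p->last_chunks = 0;
		p->freed = true;		/* both buddies gone: page is freed */
		return 0;
	}
	return -1;				/* -EAGAIN in the original */
}

static int evict_ok(struct model_page *p, bool first_buddy)
{
	(void)p;
	(void)first_buddy;
	return 0;				/* pretend writeback always succeeds */
}

int main(void)
{
	struct model_page lru[2] = { { 3, 0, false }, { 2, 5, false } };

	printf("reclaim: %d, tail freed: %d\n",
	       reclaim_page(lru, 2, evict_ok, 8), lru[1].freed);
	return 0;
}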
diff --git a/mm/zpool.c b/mm/zpool.c
index 6a19c4a58f77..846410479c2f 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -133,7 +133,6 @@ EXPORT_SYMBOL(zpool_has_pool);
* @type: The type of the zpool to create (e.g. zbud, zsmalloc)
* @name: The name of the zpool (e.g. zram0, zswap)
* @gfp: The GFP flags to use when allocating the pool.
- * @ops: The optional ops callback.
*
* This creates a new zpool of the specified type. The gfp flags will be
* used when allocating memory, if the implementation supports it. If the
@@ -145,8 +144,7 @@ EXPORT_SYMBOL(zpool_has_pool);
*
* Returns: New zpool on success, NULL on failure.
*/
-struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
- const struct zpool_ops *ops)
+struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp)
{
struct zpool_driver *driver;
struct zpool *zpool;
@@ -173,7 +171,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
}
zpool->driver = driver;
- zpool->pool = driver->create(name, gfp, ops, zpool);
+ zpool->pool = driver->create(name, gfp);
if (!zpool->pool) {
pr_err("couldn't create %s pool\n", type);
@@ -280,30 +278,6 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
}
/**
- * zpool_shrink() - Shrink the pool size
- * @zpool: The zpool to shrink.
- * @pages: The number of pages to shrink the pool.
- * @reclaimed: The number of pages successfully evicted.
- *
- * This attempts to shrink the actual memory size of the pool
- * by evicting currently used handle(s). If the pool was
- * created with no zpool_ops, or the evict call fails for any
- * of the handles, this will fail. If non-NULL, the @reclaimed
- * parameter will be set to the number of pages reclaimed,
- * which may be more than the number of pages requested.
- *
- * Implementations must guarantee this to be thread-safe.
- *
- * Returns: 0 on success, negative value on error/failure.
- */
-int zpool_shrink(struct zpool *zpool, unsigned int pages,
- unsigned int *reclaimed)
-{
- return zpool->driver->shrink ?
- zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
-}
-
-/**
* zpool_map_handle() - Map a previously allocated handle into memory
* @zpool: The zpool that the handle was allocated from
* @handle: The handle to map
@@ -360,24 +334,6 @@ u64 zpool_get_total_size(struct zpool *zpool)
}
/**
- * zpool_evictable() - Test if zpool is potentially evictable
- * @zpool: The zpool to test
- *
- * Zpool is only potentially evictable when it's created with struct
- * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
- *
- * However, it doesn't necessarily mean driver will use zpool_ops.evict
- * in its implementation of zpool_driver.shrink. It could do internal
- * defragmentation instead.
- *
- * Returns: true if potentially evictable; false otherwise.
- */
-bool zpool_evictable(struct zpool *zpool)
-{
- return zpool->driver->shrink;
-}
-
-/**
* zpool_can_sleep_mapped - Test if zpool can sleep when do mapped.
* @zpool: The zpool to test
*
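
With the shrink path gone, zpool_shrink() and zpool_evictable() no longer exist; both were thin wrappers around an optional per-driver hook. The following is a small userspace model of that optional-op dispatch, using invented names (model_driver, model_zpool): dispatch to driver->shrink when present, otherwise report -EINVAL, with "evictable" meaning nothing more than "the driver implements shrink".

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Driver vtable with the optional shrink hook, as it looked before this patch. */
struct model_driver {
	const char *name;
	int (*shrink)(void *pool, unsigned int pages, unsigned int *reclaimed);
};

struct model_zpool {
	const struct model_driver *driver;
	void *pool;
};

/* zpool_shrink() only dispatched to the driver when the hook was implemented. */
static int model_shrink(struct model_zpool *zp, unsigned int pages,
			unsigned int *reclaimed)
{
	return zp->driver->shrink ?
	       zp->driver->shrink(zp->pool, pages, reclaimed) : -EINVAL;
}

/* zpool_evictable() was simply "does the driver implement shrink?". */
static bool model_evictable(struct model_zpool *zp)
{
	return zp->driver->shrink != NULL;
}

int main(void)
{
	struct model_driver no_hook = { .name = "zsmalloc-like", .shrink = NULL };
	struct model_zpool zp = { .driver = &no_hook, .pool = NULL };
	unsigned int reclaimed = 0;

	printf("evictable: %d, shrink: %d\n",
	       model_evictable(&zp), model_shrink(&zp, 1, &reclaimed));
	return 0;
}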
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 02f7f414aade..3f057970504e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -107,21 +107,8 @@
*/
#define OBJ_ALLOCATED_TAG 1
-#ifdef CONFIG_ZPOOL
-/*
- * The second least-significant bit in the object's header identifies if the
- * value stored at the header is a deferred handle from the last reclaim
- * attempt.
- *
- * As noted above, this is valid because we have room for two bits.
- */
-#define OBJ_DEFERRED_HANDLE_TAG 2
-#define OBJ_TAG_BITS 2
-#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
-#else
#define OBJ_TAG_BITS 1
#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
-#endif /* CONFIG_ZPOOL */
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
@@ -227,12 +214,6 @@ struct link_free {
* Handle of allocated object.
*/
unsigned long handle;
-#ifdef CONFIG_ZPOOL
- /*
- * Deferred handle of a reclaimed object.
- */
- unsigned long deferred_handle;
-#endif
};
};
@@ -250,13 +231,6 @@ struct zs_pool {
/* Compact classes */
struct shrinker shrinker;
-#ifdef CONFIG_ZPOOL
- /* List tracking the zspages in LRU order by most recently added object */
- struct list_head lru;
- struct zpool *zpool;
- const struct zpool_ops *zpool_ops;
-#endif
-
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
@@ -279,13 +253,6 @@ struct zspage {
unsigned int freeobj;
struct page *first_page;
struct list_head list; /* fullness list */
-
-#ifdef CONFIG_ZPOOL
- /* links the zspage to the lru list in the pool */
- struct list_head lru;
- bool under_reclaim;
-#endif
-
struct zs_pool *pool;
rwlock_t lock;
};
@@ -384,23 +351,14 @@ static void record_obj(unsigned long handle, unsigned long obj)
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
+static void *zs_zpool_create(const char *name, gfp_t gfp)
{
/*
* Ignore global gfp flags: zs_malloc() may be invoked from
* different contexts and its caller must provide a valid
* gfp mask.
*/
- struct zs_pool *pool = zs_create_pool(name);
-
- if (pool) {
- pool->zpool = zpool;
- pool->zpool_ops = zpool_ops;
- }
-
- return pool;
+ return zs_create_pool(name);
}
static void zs_zpool_destroy(void *pool)
@@ -422,27 +380,6 @@ static void zs_zpool_free(void *pool, unsigned long handle)
zs_free(pool, handle);
}
-static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries);
-
-static int zs_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- unsigned int total = 0;
- int ret = -EINVAL;
-
- while (total < pages) {
- ret = zs_reclaim_page(pool, 8);
- if (ret < 0)
- break;
- total++;
- }
-
- if (reclaimed)
- *reclaimed = total;
-
- return ret;
-}
-
static void *zs_zpool_map(void *pool, unsigned long handle,
enum zpool_mapmode mm)
{
@@ -481,7 +418,6 @@ static struct zpool_driver zs_zpool_driver = {
.malloc_support_movable = true,
.malloc = zs_zpool_malloc,
.free = zs_zpool_free,
- .shrink = zs_zpool_shrink,
.map = zs_zpool_map,
.unmap = zs_zpool_unmap,
.total_size = zs_zpool_total_size,
@@ -884,14 +820,6 @@ static inline bool obj_allocated(struct page *page, void *obj, unsigned long *ph
return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
}
-#ifdef CONFIG_ZPOOL
-static bool obj_stores_deferred_handle(struct page *page, void *obj,
- unsigned long *phandle)
-{
- return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
-}
-#endif
-
static void reset_page(struct page *page)
{
__ClearPageMovable(page);
@@ -922,39 +850,6 @@ unlock:
return 0;
}
-#ifdef CONFIG_ZPOOL
-static unsigned long find_deferred_handle_obj(struct size_class *class,
- struct page *page, int *obj_idx);
-
-/*
- * Free all the deferred handles whose objects are freed in zs_free.
- */
-static void free_handles(struct zs_pool *pool, struct size_class *class,
- struct zspage *zspage)
-{
- int obj_idx = 0;
- struct page *page = get_first_page(zspage);
- unsigned long handle;
-
- while (1) {
- handle = find_deferred_handle_obj(class, page, &obj_idx);
- if (!handle) {
- page = get_next_page(page);
- if (!page)
- break;
- obj_idx = 0;
- continue;
- }
-
- cache_free_handle(pool, handle);
- obj_idx++;
- }
-}
-#else
-static inline void free_handles(struct zs_pool *pool, struct size_class *class,
- struct zspage *zspage) {}
-#endif
-
static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
@@ -969,9 +864,6 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(fg != ZS_INUSE_RATIO_0);
- /* Free all deferred handles from zs_free */
- free_handles(pool, class, zspage);
-
next = page = get_first_page(zspage);
do {
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1006,9 +898,6 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
}
remove_zspage(class, zspage, ZS_INUSE_RATIO_0);
-#ifdef CONFIG_ZPOOL
- list_del(&zspage->lru);
-#endif
__free_zspage(pool, class, zspage);
}
@@ -1054,11 +943,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
off %= PAGE_SIZE;
}
-#ifdef CONFIG_ZPOOL
- INIT_LIST_HEAD(&zspage->lru);
- zspage->under_reclaim = false;
-#endif
-
set_freeobj(zspage, 0);
}
@@ -1341,7 +1225,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
spin_unlock(&pool->lock);
class = zspage_class(pool, zspage);
- off = (class->size * obj_idx) & ~PAGE_MASK;
+ off = offset_in_page(class->size * obj_idx);
local_lock(&zs_map_area.lock);
area = this_cpu_ptr(&zs_map_area);
@@ -1381,7 +1265,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
class = zspage_class(pool, zspage);
- off = (class->size * obj_idx) & ~PAGE_MASK;
+ off = offset_in_page(class->size * obj_idx);
area = this_cpu_ptr(&zs_map_area);
if (off + class->size <= PAGE_SIZE)
@@ -1438,7 +1322,7 @@ static unsigned long obj_malloc(struct zs_pool *pool,
offset = obj * class->size;
nr_page = offset >> PAGE_SHIFT;
- m_offset = offset & ~PAGE_MASK;
+ m_offset = offset_in_page(offset);
m_page = get_first_page(zspage);
for (i = 0; i < nr_page; i++)
@@ -1525,20 +1409,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
out:
-#ifdef CONFIG_ZPOOL
- /* Add/move zspage to beginning of LRU */
- if (!list_empty(&zspage->lru))
- list_del(&zspage->lru);
- list_add(&zspage->lru, &pool->lru);
-#endif
-
spin_unlock(&pool->lock);
return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);
-static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
+static void obj_free(int class_size, unsigned long obj)
{
struct link_free *link;
struct zspage *zspage;
@@ -1548,31 +1425,18 @@ static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
void *vaddr;
obj_to_location(obj, &f_page, &f_objidx);
- f_offset = (class_size * f_objidx) & ~PAGE_MASK;
+ f_offset = offset_in_page(class_size * f_objidx);
zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
link = (struct link_free *)(vaddr + f_offset);
- if (handle) {
-#ifdef CONFIG_ZPOOL
- /* Stores the (deferred) handle in the object's header */
- *handle |= OBJ_DEFERRED_HANDLE_TAG;
- *handle &= ~OBJ_ALLOCATED_TAG;
-
- if (likely(!ZsHugePage(zspage)))
- link->deferred_handle = *handle;
- else
- f_page->index = *handle;
-#endif
- } else {
- /* Insert this object in containing zspage's freelist */
- if (likely(!ZsHugePage(zspage)))
- link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
- else
- f_page->index = 0;
- set_freeobj(zspage, f_objidx);
- }
+ /* Insert this object in containing zspage's freelist */
+ if (likely(!ZsHugePage(zspage)))
+ link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
+ else
+ f_page->index = 0;
+ set_freeobj(zspage, f_objidx);
kunmap_atomic(vaddr);
mod_zspage_inuse(zspage, -1);
@@ -1600,21 +1464,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
class = zspage_class(pool, zspage);
class_stat_dec(class, ZS_OBJS_INUSE, 1);
-
-#ifdef CONFIG_ZPOOL
- if (zspage->under_reclaim) {
- /*
- * Reclaim needs the handles during writeback. It'll free
- * them along with the zspage when it's done with them.
- *
- * Record current deferred handle in the object's header.
- */
- obj_free(class->size, obj, &handle);
- spin_unlock(&pool->lock);
- return;
- }
-#endif
- obj_free(class->size, obj, NULL);
+ obj_free(class->size, obj);
fullness = fix_fullness_group(class, zspage);
if (fullness == ZS_INUSE_RATIO_0)
@@ -1640,8 +1490,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
obj_to_location(src, &s_page, &s_objidx);
obj_to_location(dst, &d_page, &d_objidx);
- s_off = (class->size * s_objidx) & ~PAGE_MASK;
- d_off = (class->size * d_objidx) & ~PAGE_MASK;
+ s_off = offset_in_page(class->size * s_objidx);
+ d_off = offset_in_page(class->size * d_objidx);
if (s_off + class->size > PAGE_SIZE)
s_size = PAGE_SIZE - s_off;
@@ -1735,18 +1585,6 @@ static unsigned long find_alloced_obj(struct size_class *class,
return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
}
-#ifdef CONFIG_ZPOOL
-/*
- * Find object storing a deferred handle in header in zspage from index object
- * and return handle.
- */
-static unsigned long find_deferred_handle_obj(struct size_class *class,
- struct page *page, int *obj_idx)
-{
- return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
-}
-#endif
-
struct zs_compact_control {
/* Source spage for migration which could be a subpage of zspage */
struct page *s_page;
@@ -1786,7 +1624,7 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class,
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
record_obj(handle, free_obj);
- obj_free(class->size, used_obj, NULL);
+ obj_free(class->size, used_obj);
}
/* Remember last position in this iteration */
@@ -1846,7 +1684,7 @@ static int putback_zspage(struct size_class *class, struct zspage *zspage)
return fullness;
}
-#if defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION)
+#ifdef CONFIG_COMPACTION
/*
* To prevent zspage destroy during migration, zspage freeing should
* hold locks of all pages in the zspage.
@@ -1888,24 +1726,7 @@ static void lock_zspage(struct zspage *zspage)
}
migrate_read_unlock(zspage);
}
-#endif /* defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) */
-
-#ifdef CONFIG_ZPOOL
-/*
- * Unlocks all the pages of the zspage.
- *
- * pool->lock must be held before this function is called
- * to prevent the underlying pages from migrating.
- */
-static void unlock_zspage(struct zspage *zspage)
-{
- struct page *page = get_first_page(zspage);
-
- do {
- unlock_page(page);
- } while ((page = get_next_page(page)) != NULL);
-}
-#endif /* CONFIG_ZPOOL */
+#endif /* CONFIG_COMPACTION */
static void migrate_lock_init(struct zspage *zspage)
{
@@ -2126,9 +1947,6 @@ static void async_free_zspage(struct work_struct *work)
VM_BUG_ON(fullness != ZS_INUSE_RATIO_0);
class = pool->size_class[class_idx];
spin_lock(&pool->lock);
-#ifdef CONFIG_ZPOOL
- list_del(&zspage->lru);
-#endif
__free_zspage(pool, class, zspage);
spin_unlock(&pool->lock);
}
@@ -2474,10 +2292,6 @@ struct zs_pool *zs_create_pool(const char *name)
*/
zs_register_shrinker(pool);
-#ifdef CONFIG_ZPOOL
- INIT_LIST_HEAD(&pool->lru);
-#endif
-
return pool;
err:
@@ -2520,190 +2334,6 @@ void zs_destroy_pool(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
-#ifdef CONFIG_ZPOOL
-static void restore_freelist(struct zs_pool *pool, struct size_class *class,
- struct zspage *zspage)
-{
- unsigned int obj_idx = 0;
- unsigned long handle, off = 0; /* off is within-page offset */
- struct page *page = get_first_page(zspage);
- struct link_free *prev_free = NULL;
- void *prev_page_vaddr = NULL;
-
- /* in case no free object found */
- set_freeobj(zspage, (unsigned int)(-1UL));
-
- while (page) {
- void *vaddr = kmap_atomic(page);
- struct page *next_page;
-
- while (off < PAGE_SIZE) {
- void *obj_addr = vaddr + off;
-
- /* skip allocated object */
- if (obj_allocated(page, obj_addr, &handle)) {
- obj_idx++;
- off += class->size;
- continue;
- }
-
- /* free deferred handle from reclaim attempt */
- if (obj_stores_deferred_handle(page, obj_addr, &handle))
- cache_free_handle(pool, handle);
-
- if (prev_free)
- prev_free->next = obj_idx << OBJ_TAG_BITS;
- else /* first free object found */
- set_freeobj(zspage, obj_idx);
-
- prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
- /* if last free object in a previous page, need to unmap */
- if (prev_page_vaddr) {
- kunmap_atomic(prev_page_vaddr);
- prev_page_vaddr = NULL;
- }
-
- obj_idx++;
- off += class->size;
- }
-
- /*
- * Handle the last (full or partial) object on this page.
- */
- next_page = get_next_page(page);
- if (next_page) {
- if (!prev_free || prev_page_vaddr) {
- /*
- * There is no free object in this page, so we can safely
- * unmap it.
- */
- kunmap_atomic(vaddr);
- } else {
- /* update prev_page_vaddr since prev_free is on this page */
- prev_page_vaddr = vaddr;
- }
- } else { /* this is the last page */
- if (prev_free) {
- /*
- * Reset OBJ_TAG_BITS bit to last link to tell
- * whether it's allocated object or not.
- */
- prev_free->next = -1UL << OBJ_TAG_BITS;
- }
-
- /* unmap previous page (if not done yet) */
- if (prev_page_vaddr) {
- kunmap_atomic(prev_page_vaddr);
- prev_page_vaddr = NULL;
- }
-
- kunmap_atomic(vaddr);
- }
-
- page = next_page;
- off %= PAGE_SIZE;
- }
-}
-
-static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
-{
- int i, obj_idx, ret = 0;
- unsigned long handle;
- struct zspage *zspage;
- struct page *page;
- int fullness;
-
- /* Lock LRU and fullness list */
- spin_lock(&pool->lock);
- if (list_empty(&pool->lru)) {
- spin_unlock(&pool->lock);
- return -EINVAL;
- }
-
- for (i = 0; i < retries; i++) {
- struct size_class *class;
-
- zspage = list_last_entry(&pool->lru, struct zspage, lru);
- list_del(&zspage->lru);
-
- /* zs_free may free objects, but not the zspage and handles */
- zspage->under_reclaim = true;
-
- class = zspage_class(pool, zspage);
- fullness = get_fullness_group(class, zspage);
-
- /* Lock out object allocations and object compaction */
- remove_zspage(class, zspage, fullness);
-
- spin_unlock(&pool->lock);
- cond_resched();
-
- /* Lock backing pages into place */
- lock_zspage(zspage);
-
- obj_idx = 0;
- page = get_first_page(zspage);
- while (1) {
- handle = find_alloced_obj(class, page, &obj_idx);
- if (!handle) {
- page = get_next_page(page);
- if (!page)
- break;
- obj_idx = 0;
- continue;
- }
-
- /*
- * This will write the object and call zs_free.
- *
- * zs_free will free the object, but the
- * under_reclaim flag prevents it from freeing
- * the zspage altogether. This is necessary so
- * that we can continue working with the
- * zspage potentially after the last object
- * has been freed.
- */
- ret = pool->zpool_ops->evict(pool->zpool, handle);
- if (ret)
- goto next;
-
- obj_idx++;
- }
-
-next:
- /* For freeing the zspage, or putting it back in the pool and LRU list. */
- spin_lock(&pool->lock);
- zspage->under_reclaim = false;
-
- if (!get_zspage_inuse(zspage)) {
- /*
- * Fullness went stale as zs_free() won't touch it
- * while the page is removed from the pool. Fix it
- * up for the check in __free_zspage().
- */
- zspage->fullness = ZS_INUSE_RATIO_0;
-
- __free_zspage(pool, class, zspage);
- spin_unlock(&pool->lock);
- return 0;
- }
-
- /*
- * Eviction fails on one of the handles, so we need to restore zspage.
- * We need to rebuild its freelist (and free stored deferred handles),
- * put it back to the correct size class, and add it to the LRU list.
- */
- restore_freelist(pool, class, zspage);
- putback_zspage(class, zspage);
- list_add(&zspage->lru, &pool->lru);
- unlock_zspage(zspage);
- }
-
- spin_unlock(&pool->lock);
- return -EAGAIN;
-}
-#endif /* CONFIG_ZPOOL */
-
static int __init zs_init(void)
{
int ret;
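
Several zsmalloc call sites above switch from the open-coded "(class->size * obj_idx) & ~PAGE_MASK" to offset_in_page(), which is defined in terms of the same ~PAGE_MASK masking, so behaviour should be unchanged. A tiny standalone check, assuming a 4 KiB page for the model, compares the masked form against the equivalent modulo form:

#include <assert.h>
#include <stdio.h>

/* Model a 4 KiB page; PAGE_MASK is the usual ~(PAGE_SIZE - 1). */
#define MODEL_PAGE_SIZE	4096UL
#define MODEL_PAGE_MASK	(~(MODEL_PAGE_SIZE - 1))

/* offset_in_page(p) masks off everything above the in-page offset. */
static unsigned long model_offset_in_page(unsigned long x)
{
	return x & ~MODEL_PAGE_MASK;
}

int main(void)
{
	/* e.g. class->size * obj_idx as computed in zs_map_object() */
	for (unsigned long off = 0; off < 16 * MODEL_PAGE_SIZE; off += 208)
		assert(model_offset_in_page(off) == off % MODEL_PAGE_SIZE);

	printf("offset_in_page(%lu) = %lu\n",
	       3 * MODEL_PAGE_SIZE + 100,
	       model_offset_in_page(3 * MODEL_PAGE_SIZE + 100));
	return 0;
}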
diff --git a/mm/zswap.c b/mm/zswap.c
index 59da2a415fbb..62195f72bf56 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -37,6 +37,7 @@
#include <linux/workqueue.h>
#include "swap.h"
+#include "internal.h"
/*********************************
* statistics
@@ -137,6 +138,10 @@ static bool zswap_non_same_filled_pages_enabled = true;
module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
bool, 0644);
+static bool zswap_exclusive_loads_enabled = IS_ENABLED(
+ CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
+module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
+
/*********************************
* data structures
**********************************/
@@ -149,6 +154,12 @@ struct crypto_acomp_ctx {
struct mutex *mutex;
};
+/*
+ * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
+ * The only case where lru_lock is not acquired while holding tree.lock is
+ * when a zswap_entry is taken off the lru for writeback; in that case it
+ * needs to be verified that it's still valid in the tree.
+ */
struct zswap_pool {
struct zpool *zpool;
struct crypto_acomp_ctx __percpu *acomp_ctx;
@@ -158,6 +169,8 @@ struct zswap_pool {
struct work_struct shrink_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
+ struct list_head lru;
+ spinlock_t lru_lock;
};
/*
@@ -175,14 +188,16 @@ struct zswap_pool {
* be held while changing the refcount. Since the lock must
* be held, there is no reason to also make refcount atomic.
* length - the length in bytes of the compressed page data. Needed during
- * decompression. For a same value filled page length is 0.
+ * decompression. For a same-value filled page, length is 0, and both
+ * pool and lru are invalid and must be ignored.
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
* value - value of the same-value filled pages which have same content
+ * lru - links the entry into the pool's lru, used to evict pages.
*/
struct zswap_entry {
struct rb_node rbnode;
- pgoff_t offset;
+ swp_entry_t swpentry;
int refcount;
unsigned int length;
struct zswap_pool *pool;
@@ -191,10 +206,7 @@ struct zswap_entry {
unsigned long value;
};
struct obj_cgroup *objcg;
-};
-
-struct zswap_header {
- swp_entry_t swpentry;
+ struct list_head lru;
};
/*
@@ -238,14 +250,11 @@ static bool zswap_has_pool;
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpool))
-static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
+static int zswap_writeback_entry(struct zswap_entry *entry,
+ struct zswap_tree *tree);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);
-static const struct zpool_ops zswap_zpool_ops = {
- .evict = zswap_writeback_entry
-};
-
static bool zswap_is_full(void)
{
return totalram_pages() * zswap_max_pool_percent / 100 <
@@ -302,12 +311,14 @@ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
struct rb_node *node = root->rb_node;
struct zswap_entry *entry;
+ pgoff_t entry_offset;
while (node) {
entry = rb_entry(node, struct zswap_entry, rbnode);
- if (entry->offset > offset)
+ entry_offset = swp_offset(entry->swpentry);
+ if (entry_offset > offset)
node = node->rb_left;
- else if (entry->offset < offset)
+ else if (entry_offset < offset)
node = node->rb_right;
else
return entry;
@@ -324,13 +335,15 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
{
struct rb_node **link = &root->rb_node, *parent = NULL;
struct zswap_entry *myentry;
+ pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
while (*link) {
parent = *link;
myentry = rb_entry(parent, struct zswap_entry, rbnode);
- if (myentry->offset > entry->offset)
+ myentry_offset = swp_offset(myentry->swpentry);
+ if (myentry_offset > entry_offset)
link = &(*link)->rb_left;
- else if (myentry->offset < entry->offset)
+ else if (myentry_offset < entry_offset)
link = &(*link)->rb_right;
else {
*dupentry = myentry;
@@ -342,12 +355,14 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
return 0;
}
-static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
if (!RB_EMPTY_NODE(&entry->rbnode)) {
rb_erase(&entry->rbnode, root);
RB_CLEAR_NODE(&entry->rbnode);
+ return true;
}
+ return false;
}
/*
@@ -363,6 +378,9 @@ static void zswap_free_entry(struct zswap_entry *entry)
if (!entry->length)
atomic_dec(&zswap_same_filled_pages);
else {
+ spin_lock(&entry->pool->lru_lock);
+ list_del(&entry->lru);
+ spin_unlock(&entry->pool->lru_lock);
zpool_free(entry->pool->zpool, entry->handle);
zswap_pool_put(entry->pool);
}
@@ -583,13 +601,95 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}
+/*
+ * If the entry is still valid in the tree, drop the initial ref and remove it
+ * from the tree. This function must be called with an additional ref held,
+ * otherwise it may race with another invalidation freeing the entry.
+ */
+static void zswap_invalidate_entry(struct zswap_tree *tree,
+ struct zswap_entry *entry)
+{
+ if (zswap_rb_erase(&tree->rbroot, entry))
+ zswap_entry_put(tree, entry);
+}
+
+static int zswap_reclaim_entry(struct zswap_pool *pool)
+{
+ struct zswap_entry *entry;
+ struct zswap_tree *tree;
+ pgoff_t swpoffset;
+ int ret;
+
+ /* Get an entry off the LRU */
+ spin_lock(&pool->lru_lock);
+ if (list_empty(&pool->lru)) {
+ spin_unlock(&pool->lru_lock);
+ return -EINVAL;
+ }
+ entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
+ list_del_init(&entry->lru);
+ /*
+ * Once the lru lock is dropped, the entry might get freed. The
+ * swpoffset is copied to the stack, and entry isn't deref'd again
+ * until the entry is verified to still be alive in the tree.
+ */
+ swpoffset = swp_offset(entry->swpentry);
+ tree = zswap_trees[swp_type(entry->swpentry)];
+ spin_unlock(&pool->lru_lock);
+
+ /* Check for invalidate() race */
+ spin_lock(&tree->lock);
+ if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+ /* Hold a reference to prevent a free during writeback */
+ zswap_entry_get(entry);
+ spin_unlock(&tree->lock);
+
+ ret = zswap_writeback_entry(entry, tree);
+
+ spin_lock(&tree->lock);
+ if (ret) {
+ /* Writeback failed, put entry back on LRU */
+ spin_lock(&pool->lru_lock);
+ list_move(&entry->lru, &pool->lru);
+ spin_unlock(&pool->lru_lock);
+ goto put_unlock;
+ }
+
+ /*
+ * Writeback started successfully, the page now belongs to the
+ * swapcache. Drop the entry from zswap - unless invalidate already
+ * took it out while we had the tree->lock released for IO.
+ */
+ zswap_invalidate_entry(tree, entry);
+
+put_unlock:
+ /* Drop local reference */
+ zswap_entry_put(tree, entry);
+unlock:
+ spin_unlock(&tree->lock);
+ return ret ? -EAGAIN : 0;
+}
+
static void shrink_worker(struct work_struct *w)
{
struct zswap_pool *pool = container_of(w, typeof(*pool),
shrink_work);
+ int ret, failures = 0;
- if (zpool_shrink(pool->zpool, 1, NULL))
- zswap_reject_reclaim_fail++;
+ do {
+ ret = zswap_reclaim_entry(pool);
+ if (ret) {
+ zswap_reject_reclaim_fail++;
+ if (ret != -EAGAIN)
+ break;
+ if (++failures == MAX_RECLAIM_RETRIES)
+ break;
+ }
+ cond_resched();
+ } while (!zswap_can_accept());
zswap_pool_put(pool);
}
@@ -618,7 +718,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
/* unique name for each pool specifically required by zsmalloc */
snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
- pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
+ pool->zpool = zpool_create_pool(type, name, gfp);
if (!pool->zpool) {
pr_err("%s zpool not available\n", type);
goto error;
@@ -644,6 +744,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
*/
kref_init(&pool->kref);
INIT_LIST_HEAD(&pool->list);
+ INIT_LIST_HEAD(&pool->lru);
+ spin_lock_init(&pool->lru_lock);
INIT_WORK(&pool->shrink_work, shrink_worker);
zswap_pool_debug("created", pool);
@@ -964,16 +1066,14 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
* the swap cache, the compressed version stored by zswap can be
* freed.
*/
-static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
+static int zswap_writeback_entry(struct zswap_entry *entry,
+ struct zswap_tree *tree)
{
- struct zswap_header *zhdr;
- swp_entry_t swpentry;
- struct zswap_tree *tree;
- pgoff_t offset;
- struct zswap_entry *entry;
+ swp_entry_t swpentry = entry->swpentry;
struct page *page;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
+ struct zpool *pool = entry->pool->zpool;
u8 *src, *tmp = NULL;
unsigned int dlen;
@@ -988,25 +1088,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
return -ENOMEM;
}
- /* extract swpentry from data */
- zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
- swpentry = zhdr->swpentry; /* here */
- tree = zswap_trees[swp_type(swpentry)];
- offset = swp_offset(swpentry);
- zpool_unmap_handle(pool, handle);
-
- /* find and ref zswap entry */
- spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
- if (!entry) {
- /* entry was invalidated */
- spin_unlock(&tree->lock);
- kfree(tmp);
- return 0;
- }
- spin_unlock(&tree->lock);
- BUG_ON(offset != entry->offset);
-
/* try to allocate swap cache page */
switch (zswap_get_swap_cache_page(swpentry, &page)) {
case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
@@ -1028,7 +1109,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
* writing.
*/
spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, entry->offset) != entry) {
+ if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(page_folio(page));
ret = -ENOMEM;
@@ -1040,12 +1121,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
dlen = PAGE_SIZE;
- zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
- src = (u8 *)zhdr + sizeof(struct zswap_header);
+ src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
if (!zpool_can_sleep_mapped(pool)) {
memcpy(tmp, src, entry->length);
src = tmp;
- zpool_unmap_handle(pool, handle);
+ zpool_unmap_handle(pool, entry->handle);
}
mutex_lock(acomp_ctx->mutex);
@@ -1060,7 +1140,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
if (!zpool_can_sleep_mapped(pool))
kfree(tmp);
else
- zpool_unmap_handle(pool, handle);
+ zpool_unmap_handle(pool, entry->handle);
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -1077,23 +1157,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
put_page(page);
zswap_written_back_pages++;
- spin_lock(&tree->lock);
- /* drop local reference */
- zswap_entry_put(tree, entry);
-
- /*
- * There are two possible situations for entry here:
- * (1) refcount is 1(normal case), entry is valid and on the tree
- * (2) refcount is 0, entry is freed and not on the tree
- * because invalidate happened during writeback
- * search the tree and free the entry if find entry
- */
- if (entry == zswap_rb_search(&tree->rbroot, offset))
- zswap_entry_put(tree, entry);
- spin_unlock(&tree->lock);
-
return ret;
-
fail:
if (!zpool_can_sleep_mapped(pool))
kfree(tmp);
@@ -1102,13 +1166,8 @@ fail:
* if we get here due to ZSWAP_SWAPCACHE_EXIST
* a load may be happening concurrently.
* it is safe and okay to not free the entry.
- * if we free the entry in the following put
* it is also okay to return !0
*/
- spin_lock(&tree->lock);
- zswap_entry_put(tree, entry);
- spin_unlock(&tree->lock);
-
return ret;
}
@@ -1156,11 +1215,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
struct obj_cgroup *objcg = NULL;
struct zswap_pool *pool;
int ret;
- unsigned int hlen, dlen = PAGE_SIZE;
+ unsigned int dlen = PAGE_SIZE;
unsigned long handle, value;
char *buf;
u8 *src, *dst;
- struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
gfp_t gfp;
/* THP isn't supported */
@@ -1174,9 +1232,16 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto reject;
}
+ /*
+ * XXX: zswap reclaim does not work with cgroups yet. Without a
+ * cgroup-aware entry LRU, we will push out entries system-wide based on
+ * local cgroup limits.
+ */
objcg = get_obj_cgroup_from_page(page);
- if (objcg && !obj_cgroup_may_zswap(objcg))
- goto shrink;
+ if (objcg && !obj_cgroup_may_zswap(objcg)) {
+ ret = -ENOMEM;
+ goto reject;
+ }
/* reclaim space if needed */
if (zswap_is_full()) {
@@ -1188,7 +1253,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
if (zswap_pool_reached_full) {
if (!zswap_can_accept()) {
ret = -ENOMEM;
- goto reject;
+ goto shrink;
} else
zswap_pool_reached_full = false;
}
@@ -1205,7 +1270,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
src = kmap_atomic(page);
if (zswap_is_page_same_filled(src, &value)) {
kunmap_atomic(src);
- entry->offset = offset;
+ entry->swpentry = swp_entry(type, offset);
entry->length = 0;
entry->value = value;
atomic_inc(&zswap_same_filled_pages);
@@ -1259,11 +1324,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}
/* store */
- hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
if (zpool_malloc_support_movable(entry->pool->zpool))
gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
- ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
+ ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
@@ -1273,13 +1337,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto put_dstmem;
}
buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
- memcpy(buf, &zhdr, hlen);
- memcpy(buf + hlen, dst, dlen);
+ memcpy(buf, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
mutex_unlock(acomp_ctx->mutex);
/* populate entry */
- entry->offset = offset;
+ entry->swpentry = swp_entry(type, offset);
entry->handle = handle;
entry->length = dlen;
@@ -1302,6 +1365,11 @@ insert_entry:
zswap_entry_put(tree, dupentry);
}
} while (ret == -EEXIST);
+ if (entry->length) {
+ spin_lock(&entry->pool->lru_lock);
+ list_add(&entry->lru, &entry->pool->lru);
+ spin_unlock(&entry->pool->lru_lock);
+ }
spin_unlock(&tree->lock);
/* update stats */
@@ -1334,7 +1402,7 @@ shrink:
* return -1 on entry not found or error
*/
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
- struct page *page)
+ struct page *page, bool *exclusive)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
@@ -1373,8 +1441,6 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
/* decompress */
dlen = PAGE_SIZE;
src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
- if (zpool_evictable(entry->pool->zpool))
- src += sizeof(struct zswap_header);
if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
memcpy(tmp, src, entry->length);
@@ -1403,6 +1469,14 @@ stats:
count_objcg_event(entry->objcg, ZSWPIN);
freeentry:
spin_lock(&tree->lock);
+ if (!ret && zswap_exclusive_loads_enabled) {
+ zswap_invalidate_entry(tree, entry);
+ *exclusive = true;
+ } else if (entry->length) {
+ spin_lock(&entry->pool->lru_lock);
+ list_move(&entry->lru, &entry->pool->lru);
+ spin_unlock(&entry->pool->lru_lock);
+ }
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
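
On the load side, when exclusive loads are enabled a successful read invalidates the entry and reports that through *exclusive; otherwise the entry is rotated to the head of the pool LRU so recently loaded pages are written back last. A toy userspace model of those two behaviours follows; model_cache, model_load and the flat array standing in for both the tree and the LRU are invented for illustration only.

#include <stdbool.h>
#include <stdio.h>

/* Toy entry store; index 0 is the most recently used slot. */
struct model_cache {
	int entries[8];
	int nr;
	bool exclusive_loads;
};

static int model_find(struct model_cache *c, int key)
{
	for (int i = 0; i < c->nr; i++)
		if (c->entries[i] == key)
			return i;
	return -1;
}

static void model_remove_at(struct model_cache *c, int idx)
{
	for (int i = idx; i < c->nr - 1; i++)
		c->entries[i] = c->entries[i + 1];
	c->nr--;
}

/*
 * Model of the load-path change: exclusive loads drop the compressed copy
 * as it is read and tell the caller via *exclusive; otherwise the entry is
 * rotated to the LRU head.
 */
static int model_load(struct model_cache *c, int key, bool *exclusive)
{
	int idx = model_find(c, key);

	if (idx < 0)
		return -1;
	model_remove_at(c, idx);
	if (c->exclusive_loads) {
		*exclusive = true;
	} else {
		for (int i = c->nr; i > 0; i--)	/* re-insert at the head */
			c->entries[i] = c->entries[i - 1];
		c->entries[0] = key;
		c->nr++;
	}
	return 0;
}

int main(void)
{
	struct model_cache c = { .entries = {1, 2, 3}, .nr = 3,
				 .exclusive_loads = true };
	bool exclusive = false;

	model_load(&c, 2, &exclusive);
	printf("exclusive=%d, remaining=%d\n", exclusive, c.nr);	/* 1, 2 */
	return 0;
}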
@@ -1423,13 +1497,7 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
spin_unlock(&tree->lock);
return;
}
-
- /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, entry);
-
- /* drop the initial reference from entry creation */
- zswap_entry_put(tree, entry);
-
+ zswap_invalidate_entry(tree, entry);
spin_unlock(&tree->lock);
}
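
Finally, zswap_rb_erase() now reports whether it actually removed the node, which is what lets zswap_invalidate_entry() drop the tree's initial reference exactly once even when invalidation races with writeback (both paths above call it). A small refcount model of that idempotency, with made-up names and no locking:

#include <stdbool.h>
#include <stdio.h>

/* Toy entry: 'in_tree' models RB_EMPTY_NODE(), refcount models entry refs. */
struct model_entry {
	bool in_tree;
	int refcount;	/* starts at 1: the tree's initial reference */
	bool freed;
};

static void model_entry_put(struct model_entry *e)
{
	if (--e->refcount == 0)
		e->freed = true;
}

/* zswap_rb_erase() now reports whether it actually removed the node. */
static bool model_rb_erase(struct model_entry *e)
{
	if (e->in_tree) {
		e->in_tree = false;
		return true;
	}
	return false;
}

/*
 * zswap_invalidate_entry(): drop the tree's initial ref only if the entry
 * was still in the tree, so concurrent invalidation and writeback cannot
 * both drop it.
 */
static void model_invalidate(struct model_entry *e)
{
	if (model_rb_erase(e))
		model_entry_put(e);
}

int main(void)
{
	struct model_entry e = { .in_tree = true, .refcount = 2 };	/* tree ref + writeback ref */

	model_invalidate(&e);	/* writeback path drops it from the tree */
	model_invalidate(&e);	/* racing invalidate: erase is a no-op now */
	model_entry_put(&e);	/* writeback drops its local reference */
	printf("refcount=%d freed=%d\n", e.refcount, e.freed);		/* 0 1 */
	return 0;
}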