summaryrefslogtreecommitdiff
path: root/mm/vmscan.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-12-14 06:29:45 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2022-12-14 06:29:45 +0300
commite2ca6ba6ba0152361aa4fcbf6067db71b2c7a770 (patch)
treef7ed7753a2e66486a4ffe0fbbf98404ec4ba2212 /mm/vmscan.c
parent7e68dd7d07a28faa2e6574dd6b9dbd90cdeaae91 (diff)
parentc45bc55a99957b20e4e0333bcd42e12d1833a7f5 (diff)
downloadlinux-e2ca6ba6ba0152361aa4fcbf6067db71b2c7a770.tar.xz
Merge tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - More userfaultfs work from Peter Xu - Several convert-to-folios series from Sidhartha Kumar and Huang Ying - Some filemap cleanups from Vishal Moola - David Hildenbrand added the ability to selftest anon memory COW handling - Some cpuset simplifications from Liu Shixin - Addition of vmalloc tracing support by Uladzislau Rezki - Some pagecache folioifications and simplifications from Matthew Wilcox - A pagemap cleanup from Kefeng Wang: we have VM_ACCESS_FLAGS, so use it - Miguel Ojeda contributed some cleanups for our use of the __no_sanitize_thread__ gcc keyword. This series should have been in the non-MM tree, my bad - Naoya Horiguchi improved the interaction between memory poisoning and memory section removal for huge pages - DAMON cleanups and tuneups from SeongJae Park - Tony Luck fixed the handling of COW faults against poisoned pages - Peter Xu utilized the PTE marker code for handling swapin errors - Hugh Dickins reworked compound page mapcount handling, simplifying it and making it more efficient - Removal of the autonuma savedwrite infrastructure from Nadav Amit and David Hildenbrand - zram support for multiple compression streams from Sergey Senozhatsky - David Hildenbrand reworked the GUP code's R/O long-term pinning so that drivers no longer need to use the FOLL_FORCE workaround which didn't work very well anyway - Mel Gorman altered the page allocator so that local IRQs can remnain enabled during per-cpu page allocations - Vishal Moola removed the try_to_release_page() wrapper - Stefan Roesch added some per-BDI sysfs tunables which are used to prevent network block devices from dirtying excessive amounts of pagecache - David Hildenbrand did some cleanup and repair work on KSM COW breaking - Nhat Pham and Johannes Weiner have implemented writeback in zswap's zsmalloc backend - Brian Foster has fixed a longstanding corner-case oddity in file[map]_write_and_wait_range() - sparse-vmemmap changes for MIPS, LoongArch and NIOS2 from Feiyang Chen - Shiyang Ruan has done some work on fsdax, to make its reflink mode work better under xfstests. Better, but still not perfect - Christoph Hellwig has removed the .writepage() method from several filesystems. They only need .writepages() - Yosry Ahmed wrote a series which fixes the memcg reclaim target beancounting - David Hildenbrand has fixed some of our MM selftests for 32-bit machines - Many singleton patches, as usual * tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (313 commits) mm/hugetlb: set head flag before setting compound_order in __prep_compound_gigantic_folio mm: mmu_gather: allow more than one batch of delayed rmaps mm: fix typo in struct pglist_data code comment kmsan: fix memcpy tests mm: add cond_resched() in swapin_walk_pmd_entry() mm: do not show fs mm pc for VM_LOCKONFAULT pages selftests/vm: ksm_functional_tests: fixes for 32bit selftests/vm: cow: fix compile warning on 32bit selftests/vm: madv_populate: fix missing MADV_POPULATE_(READ|WRITE) definitions mm/gup_test: fix PIN_LONGTERM_TEST_READ with highmem mm,thp,rmap: fix races between updates of subpages_mapcount mm: memcg: fix swapcached stat accounting mm: add nodes= arg to memory.reclaim mm: disable top-tier fallback to reclaim on proactive reclaim selftests: cgroup: make sure reclaim target memcg is unprotected selftests: cgroup: refactor proactive reclaim code to reclaim_until() mm: memcg: fix stale protection of reclaim target memcg mm/mmap: properly unaccount memory on mas_preallocate() failure omfs: remove ->writepage jfs: remove ->writepage ...
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--mm/vmscan.c125
1 files changed, 79 insertions, 46 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8fcc5fa768c0..aba991c505f1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,7 @@
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
+#include <linux/khugepaged.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -1020,31 +1021,52 @@ out:
return freed;
}
-static void drop_slab_node(int nid)
+static unsigned long drop_slab_node(int nid)
{
- unsigned long freed;
- int shift = 0;
+ unsigned long freed = 0;
+ struct mem_cgroup *memcg = NULL;
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- struct mem_cgroup *memcg = NULL;
+ freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- if (fatal_signal_pending(current))
- return;
+ return freed;
+}
+void drop_slab(void)
+{
+ int nid;
+ int shift = 0;
+ unsigned long freed;
+
+ do {
freed = 0;
- memcg = mem_cgroup_iter(NULL, NULL, NULL);
- do {
- freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
- } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+ for_each_online_node(nid) {
+ if (fatal_signal_pending(current))
+ return;
+
+ freed += drop_slab_node(nid);
+ }
} while ((freed >> shift++) > 1);
}
-void drop_slab(void)
+static int reclaimer_offset(void)
{
- int nid;
+ BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+ PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+ PGSCAN_DIRECT - PGSCAN_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+ PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+ PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
- for_each_online_node(nid)
- drop_slab_node(nid);
+ if (current_is_kswapd())
+ return 0;
+ if (current_is_khugepaged())
+ return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+ return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
static inline int is_page_cache_freeable(struct folio *folio)
@@ -1346,11 +1368,10 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (folio_test_swapcache(folio)) {
swp_entry_t swap = folio_swap_entry(folio);
- /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- mem_cgroup_swapout(folio, swap);
__delete_from_swap_cache(folio, swap, shadow);
+ mem_cgroup_swapout(folio, swap);
xa_unlock_irq(&mapping->i_pages);
put_swap_folio(folio, swap);
} else {
@@ -1599,10 +1620,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
- if (current_is_kswapd())
- __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
- else
- __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+ __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
return nr_succeeded;
}
@@ -2069,10 +2087,29 @@ keep:
nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
/* Folios that could not be demoted are still in @demote_folios */
if (!list_empty(&demote_folios)) {
- /* Folios which weren't demoted go back on @folio_list for retry: */
+ /* Folios which weren't demoted go back on @folio_list */
list_splice_init(&demote_folios, folio_list);
- do_demote_pass = false;
- goto retry;
+
+ /*
+ * goto retry to reclaim the undemoted folios in folio_list if
+ * desired.
+ *
+ * Reclaiming directly from top tier nodes is not often desired
+ * due to it breaking the LRU ordering: in general memory
+ * should be reclaimed from lower tier nodes and demoted from
+ * top tier nodes.
+ *
+ * However, disabling reclaim from top tier nodes entirely
+ * would cause ooms in edge scenarios where lower tier memory
+ * is unreclaimable for whatever reason, eg memory being
+ * mlocked or too hot to reclaim. We can disable reclaim
+ * from top tier nodes in proactive reclaim though as that is
+ * not real memory pressure.
+ */
+ if (!sc->proactive) {
+ do_demote_pass = false;
+ goto retry;
+ }
}
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
@@ -2475,7 +2512,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ item = PGSCAN_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
@@ -2492,14 +2529,14 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
move_folios_to_lru(lruvec, &folio_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout);
+ lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
mem_cgroup_uncharge_list(&folio_list);
free_unref_page_list(&folio_list);
@@ -2651,6 +2688,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
+ if (nr_rotated)
+ lru_note_cost(lruvec, file, 0, nr_rotated);
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
@@ -3145,7 +3184,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
if (memcg) {
struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
- /* for hotadd_new_pgdat() */
+ /* see the comment in mem_cgroup_lruvec() */
if (!lruvec->pgdat)
lruvec->pgdat = pgdat;
@@ -3154,7 +3193,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
#endif
VM_WARN_ON_ONCE(!mem_cgroup_disabled());
- return pgdat ? &pgdat->__lruvec : NULL;
+ return &pgdat->__lruvec;
}
static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
@@ -3218,9 +3257,6 @@ void lru_gen_add_mm(struct mm_struct *mm)
for_each_node_state(nid, N_MEMORY) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
- if (!lruvec)
- continue;
-
/* the first addition since the last iteration */
if (lruvec->mm_state.tail == &mm_list->fifo)
lruvec->mm_state.tail = &mm->lru_gen.list;
@@ -3250,9 +3286,6 @@ void lru_gen_del_mm(struct mm_struct *mm)
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
- if (!lruvec)
- continue;
-
/* where the last iteration ended (exclusive) */
if (lruvec->mm_state.tail == &mm->lru_gen.list)
lruvec->mm_state.tail = lruvec->mm_state.tail->next;
@@ -4498,7 +4531,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned
mem_cgroup_calculate_protection(NULL, memcg);
- if (mem_cgroup_below_min(memcg))
+ if (mem_cgroup_below_min(NULL, memcg))
return false;
need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
@@ -4869,7 +4902,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
break;
}
- item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ item = PGSCAN_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc)) {
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
@@ -5047,7 +5080,7 @@ retry:
if (walk && walk->batched)
reset_batch_size(lruvec, walk);
- item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
__count_memcg_events(memcg, item, reclaimed);
@@ -5085,8 +5118,9 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- if (mem_cgroup_below_min(memcg) ||
- (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) ||
+ (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) &&
+ !sc->memcg_low_reclaim))
return 0;
*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
@@ -5327,9 +5361,6 @@ static void lru_gen_change_state(bool enabled)
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
- if (!lruvec)
- continue;
-
spin_lock_irq(&lruvec->lru_lock);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -5395,7 +5426,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
- return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+ return sysfs_emit(buf, "0x%04x\n", caps);
}
/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
@@ -6084,13 +6115,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
mem_cgroup_calculate_protection(target_memcg, memcg);
- if (mem_cgroup_below_min(memcg)) {
+ if (mem_cgroup_below_min(target_memcg, memcg)) {
/*
* Hard protection.
* If there is no reclaimable memory, OOM.
*/
continue;
- } else if (mem_cgroup_below_low(memcg)) {
+ } else if (mem_cgroup_below_low(target_memcg, memcg)) {
/*
* Soft protection.
* Respect the protection only as long as
@@ -6726,7 +6757,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- unsigned int reclaim_options)
+ unsigned int reclaim_options,
+ nodemask_t *nodemask)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@@ -6741,6 +6773,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+ .nodemask = nodemask,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put