Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  381
1 file changed, 218 insertions(+), 163 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 506f8220c5fe..4f9c854ce6cc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -411,10 +411,10 @@ static int reclaimer_offset(void)
{
BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGSCAN_DIRECT - PGSCAN_KSWAPD);
BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+ PGSCAN_DIRECT - PGSCAN_KSWAPD);
BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
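
The BUILD_BUG_ON()s above pin an invariant: the PGSTEAL_*, PGSCAN_*, and PGDEMOTE_* enums must keep the same relative spacing, so the single result of reclaimer_offset() can index any of the three counter families. A minimal userspace sketch of that trick (hypothetical enum values, not the kernel's vmstat items):

#include <assert.h>
#include <stdio.h>

/* hypothetical stand-ins for the vmstat item enums */
enum { PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED };
enum { PGSCAN_KSWAPD,  PGSCAN_DIRECT,  PGSCAN_KHUGEPAGED };

/* one offset selects the right slot in every counter family */
static int reclaimer_offset_model(int is_kswapd, int is_khugepaged)
{
        if (is_kswapd)
                return 0;
        if (is_khugepaged)
                return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
        return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}

int main(void)
{
        /* the runtime analogue of the BUILD_BUG_ON()s above */
        assert(PGSTEAL_DIRECT - PGSTEAL_KSWAPD ==
               PGSCAN_DIRECT - PGSCAN_KSWAPD);

        /* direct reclaim: PGSCAN_KSWAPD + 1 == PGSCAN_DIRECT */
        printf("PGSCAN slot: %d\n", PGSCAN_KSWAPD + reclaimer_offset_model(0, 0));
        return 0;
}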
@@ -977,7 +977,8 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
- __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
+ mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ nr_succeeded);
return nr_succeeded;
}
@@ -2222,7 +2223,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
* Flush the memory cgroup stats, so that we read accurate per-memcg
* lruvec stats for heuristics.
*/
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(sc->target_mem_cgroup);
/*
* Determine the scan balance between anon and file LRUs.
@@ -2667,13 +2668,14 @@ static void get_item_key(void *item, int *key)
key[1] = hash >> BLOOM_FILTER_SHIFT;
}
-static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
+ void *item)
{
int key[2];
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
- filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ filter = READ_ONCE(mm_state->filters[gen]);
if (!filter)
return true;
@@ -2682,13 +2684,14 @@ static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *it
return test_bit(key[0], filter) && test_bit(key[1], filter);
}
-static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
+ void *item)
{
int key[2];
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
- filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ filter = READ_ONCE(mm_state->filters[gen]);
if (!filter)
return;
@@ -2700,12 +2703,12 @@ static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *
set_bit(key[1], filter);
}
-static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq)
{
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
- filter = lruvec->mm_state.filters[gen];
+ filter = mm_state->filters[gen];
if (filter) {
bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
return;
@@ -2713,13 +2716,15 @@ static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
- WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
+ WRITE_ONCE(mm_state->filters[gen], filter);
}
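
These helpers implement a per-generation Bloom filter: get_item_key() derives two bit indices from a hashed PMD pointer, update_bloom_filter() sets both bits, and test_bloom_filter() reports "maybe present" when both are set, or when the filter was never allocated, so nothing is wrongly skipped. A self-contained userspace model of the two-key scheme (hypothetical hash and sizes, assuming 64-bit words):

#include <stdbool.h>
#include <stdint.h>

#define BLOOM_SHIFT     15                      /* stands in for BLOOM_FILTER_SHIFT */
#define BLOOM_BITS      (1UL << BLOOM_SHIFT)

static uint64_t bloom[BLOOM_BITS / 64];

/* stand-in for get_item_key(): split one hash into two bit indices */
static void item_keys(const void *item, unsigned int *k0, unsigned int *k1)
{
        uintptr_t hash = (uintptr_t)item * 0x9e3779b97f4a7c15ULL;

        *k0 = hash & (BLOOM_BITS - 1);
        *k1 = (hash >> BLOOM_SHIFT) & (BLOOM_BITS - 1);
}

static void set_bit_m(unsigned int b)  { bloom[b / 64] |= (uint64_t)1 << (b % 64); }
static bool test_bit_m(unsigned int b) { return (bloom[b / 64] >> (b % 64)) & 1; }

static void bloom_update(const void *item)
{
        unsigned int k0, k1;

        item_keys(item, &k0, &k1);
        set_bit_m(k0);
        set_bit_m(k1);
}

static bool bloom_test(const void *item)
{
        unsigned int k0, k1;

        item_keys(item, &k0, &k1);
        /* false positives are possible, false negatives are not */
        return test_bit_m(k0) && test_bit_m(k1);
}

int main(void)
{
        int x;

        bloom_update(&x);
        return bloom_test(&x) ? 0 : 1;  /* an inserted item always tests positive */
}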
/******************************************************************************
* mm_struct list
******************************************************************************/
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
+
static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
{
static struct lru_gen_mm_list mm_list = {
@@ -2736,6 +2741,29 @@ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
return &mm_list;
}
+static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
+{
+ return &lruvec->mm_state;
+}
+
+static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
+{
+ int key;
+ struct mm_struct *mm;
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
+
+ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+ key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
+
+ if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
+ return NULL;
+
+ clear_bit(key, &mm->lru_gen.bitmap);
+
+ return mmget_not_zero(mm) ? mm : NULL;
+}
+
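
The new get_next_mm() keeps the per-node skip logic that used to live in should_skip_mm(): each mm_struct carries a bitmap with one bit per NUMA node (folded modulo the bitmap width); the bit means "this mm saw activity since this node's walker last visited it" and is consumed on each visit. A userspace sketch of that test-and-clear filter (hypothetical names, not kernel code):

#include <limits.h>
#include <stdbool.h>

#define NODE_KEY_BITS   (sizeof(unsigned long) * CHAR_BIT)

struct mm_model {
        /* bit (node % NODE_KEY_BITS): activity since that node's last walk */
        unsigned long node_bitmap;
};

/* returns true when this node's walker may skip the mm entirely */
static bool skip_unless_touched(struct mm_model *mm, int node_id, bool force)
{
        unsigned long key = node_id % NODE_KEY_BITS;

        if (!force && !(mm->node_bitmap & (1UL << key)))
                return true;                    /* no new activity for this node */

        mm->node_bitmap &= ~(1UL << key);       /* consume the bit */
        return false;
}

int main(void)
{
        struct mm_model mm = { .node_bitmap = 1UL << 0 };

        /* node 0 walks it once, then skips it until it is touched again */
        return !skip_unless_touched(&mm, 0, false) &&
                skip_unless_touched(&mm, 0, false) ? 0 : 1;
}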
void lru_gen_add_mm(struct mm_struct *mm)
{
int nid;
@@ -2751,10 +2779,11 @@ void lru_gen_add_mm(struct mm_struct *mm)
for_each_node_state(nid, N_MEMORY) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
/* the first addition since the last iteration */
- if (lruvec->mm_state.tail == &mm_list->fifo)
- lruvec->mm_state.tail = &mm->lru_gen.list;
+ if (mm_state->tail == &mm_list->fifo)
+ mm_state->tail = &mm->lru_gen.list;
}
list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
@@ -2780,14 +2809,15 @@ void lru_gen_del_mm(struct mm_struct *mm)
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
/* where the current iteration continues after */
- if (lruvec->mm_state.head == &mm->lru_gen.list)
- lruvec->mm_state.head = lruvec->mm_state.head->prev;
+ if (mm_state->head == &mm->lru_gen.list)
+ mm_state->head = mm_state->head->prev;
/* where the last iteration ended before */
- if (lruvec->mm_state.tail == &mm->lru_gen.list)
- lruvec->mm_state.tail = lruvec->mm_state.tail->next;
+ if (mm_state->tail == &mm->lru_gen.list)
+ mm_state->tail = mm_state->tail->next;
}
list_del_init(&mm->lru_gen.list);
@@ -2830,10 +2860,30 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
}
#endif
+#else /* !CONFIG_LRU_GEN_WALKS_MMU */
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+ return NULL;
+}
+
+static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
+{
+ return NULL;
+}
+
+static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
+{
+ return NULL;
+}
+
+#endif
+
static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
{
int i;
int hist;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
@@ -2841,42 +2891,18 @@ static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
hist = lru_hist_from_seq(walk->max_seq);
for (i = 0; i < NR_MM_STATS; i++) {
- WRITE_ONCE(lruvec->mm_state.stats[hist][i],
- lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
+ WRITE_ONCE(mm_state->stats[hist][i],
+ mm_state->stats[hist][i] + walk->mm_stats[i]);
walk->mm_stats[i] = 0;
}
}
if (NR_HIST_GENS > 1 && last) {
- hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
+ hist = lru_hist_from_seq(mm_state->seq + 1);
for (i = 0; i < NR_MM_STATS; i++)
- WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
- }
-}
-
-static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
-{
- int type;
- unsigned long size = 0;
- struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
-
- if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
- return true;
-
- clear_bit(key, &mm->lru_gen.bitmap);
-
- for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
- size += type ? get_mm_counter(mm, MM_FILEPAGES) :
- get_mm_counter(mm, MM_ANONPAGES) +
- get_mm_counter(mm, MM_SHMEMPAGES);
+ WRITE_ONCE(mm_state->stats[hist][i], 0);
}
-
- if (size < MIN_LRU_BATCH)
- return true;
-
- return !mmget_not_zero(mm);
}
static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
@@ -2887,7 +2913,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
struct mm_struct *mm = NULL;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
/*
* mm_state->seq is incremented after each iteration of mm_list. There
@@ -2925,11 +2951,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
mm_state->tail = mm_state->head->next;
walk->force_scan = true;
}
-
- mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
- if (should_skip_mm(mm, walk))
- mm = NULL;
- } while (!mm);
+ } while (!(mm = get_next_mm(walk)));
done:
if (*iter || last)
reset_mm_stats(lruvec, walk, last);
@@ -2937,7 +2959,7 @@ done:
spin_unlock(&mm_list->lock);
if (mm && first)
- reset_bloom_filter(lruvec, walk->max_seq + 1);
+ reset_bloom_filter(mm_state, walk->max_seq + 1);
if (*iter)
mmput_async(*iter);
@@ -2952,7 +2974,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
bool success = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
spin_lock(&mm_list->lock);
@@ -3248,7 +3270,6 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
return pfn;
}
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
{
unsigned long pfn = pmd_pfn(pmd);
@@ -3266,7 +3287,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
return pfn;
}
-#endif
static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
struct pglist_data *pgdat, bool can_swap)
@@ -3369,7 +3389,6 @@ restart:
return suitable_to_scan(total, young);
}
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
@@ -3447,12 +3466,6 @@ next:
done:
*first = -1;
}
-#else
-static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
- struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
-{
-}
-#endif
static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
struct mm_walk *args)
@@ -3465,6 +3478,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
unsigned long first = -1;
struct lru_gen_mm_walk *walk = args->private;
+ struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -3487,7 +3501,6 @@ restart:
continue;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(val)) {
unsigned long pfn = pmd_pfn(val);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
@@ -3506,7 +3519,7 @@ restart:
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
continue;
}
-#endif
+
walk->mm_stats[MM_NONLEAF_TOTAL]++;
if (should_clear_pmd_young()) {
@@ -3516,7 +3529,7 @@ restart:
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
}
- if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
+ if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i))
continue;
walk->mm_stats[MM_NONLEAF_FOUND]++;
@@ -3527,7 +3540,7 @@ restart:
walk->mm_stats[MM_NONLEAF_ADDED]++;
/* carry over to the next generation */
- update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
+ update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i);
}
walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
@@ -3734,16 +3747,25 @@ next:
return success;
}
-static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ bool can_swap, bool force_scan)
{
+ bool success;
int prev, next;
int type, zone;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
restart:
+ if (max_seq < READ_ONCE(lrugen->max_seq))
+ return false;
+
spin_lock_irq(&lruvec->lru_lock);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ success = max_seq == lrugen->max_seq;
+ if (!success)
+ goto unlock;
+
for (type = ANON_AND_FILE - 1; type >= 0; type--) {
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
@@ -3787,8 +3809,10 @@ restart:
WRITE_ONCE(lrugen->timestamps[next], jiffies);
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
-
+unlock:
spin_unlock_irq(&lruvec->lru_lock);
+
+ return success;
}
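
inc_max_seq() now revalidates the caller's max_seq twice: once without the lock as a cheap early-out, and again under lru_lock so that, of several racing callers, exactly one advances the sequence. A minimal pthread model of this check/lock/re-check pattern (standalone, not kernel code; a real implementation would use atomics for the unlocked read, as the kernel's READ_ONCE() does):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t seq_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long cur_seq;

/* advance cur_seq from expected to expected + 1 exactly once across racers */
static bool inc_seq_once(unsigned long expected)
{
        bool success;

        /* unlocked early-out: someone already advanced past us */
        if (expected < cur_seq)
                return false;

        pthread_mutex_lock(&seq_lock);
        /* re-check under the lock: only one racer sees a match */
        success = (expected == cur_seq);
        if (success)
                cur_seq = expected + 1; /* upstream pairs this with smp_store_release() */
        pthread_mutex_unlock(&seq_lock);

        return success;
}

int main(void)
{
        /* the first caller wins; a stale second caller loses */
        return inc_seq_once(0) && !inc_seq_once(0) ? 0 : 1;
}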
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
@@ -3798,14 +3822,16 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
struct lru_gen_mm_walk *walk;
struct mm_struct *mm = NULL;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+ if (!mm_state)
+ return inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+
/* see the comment in iterate_mm_list() */
- if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
- success = false;
- goto done;
- }
+ if (max_seq <= READ_ONCE(mm_state->seq))
+ return false;
/*
* If the hardware doesn't automatically set the accessed bit, fallback
@@ -3835,8 +3861,10 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
walk_mm(lruvec, mm, walk);
} while (mm);
done:
- if (success)
- inc_max_seq(lruvec, can_swap, force_scan);
+ if (success) {
+ success = inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+ WARN_ON_ONCE(!success);
+ }
return success;
}
@@ -3955,11 +3983,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
int young = 0;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
+ struct vm_area_struct *vma = pvmw->vma;
struct folio *folio = pfn_folio(pvmw->pfn);
bool can_swap = !folio_is_file_lru(folio);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
DEFINE_MAX_SEQ(lruvec);
int old_gen, new_gen = lru_gen_from_seq(max_seq);
@@ -3969,11 +3999,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (spin_is_contended(pvmw->ptl))
return;
+ /* exclude special VMAs containing anon pages from COW */
+ if (vma->vm_flags & VM_SPECIAL)
+ return;
+
/* avoid taking the LRU lock under the PTL when possible */
walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
- start = max(addr & PMD_MASK, pvmw->vma->vm_start);
- end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+ start = max(addr & PMD_MASK, vma->vm_start);
+ end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
@@ -3998,7 +4032,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
unsigned long pfn;
pte_t ptent = ptep_get(pte + i);
- pfn = get_pte_pfn(ptent, pvmw->vma, addr);
+ pfn = get_pte_pfn(ptent, vma, addr);
if (pfn == -1)
continue;
@@ -4009,7 +4043,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (!folio)
continue;
- if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ if (!ptep_test_and_clear_young(vma, addr, pte + i))
VM_WARN_ON_ONCE(true);
young++;
@@ -4038,8 +4072,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
mem_cgroup_unlock_pages();
/* feedback from rmap walkers to page table walkers */
- if (suitable_to_scan(i, young))
- update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+ if (mm_state && suitable_to_scan(i, young))
+ update_bloom_filter(mm_state, max_seq, pvmw->pmd);
}
/******************************************************************************
@@ -4055,13 +4089,6 @@ enum {
MEMCG_LRU_YOUNG,
};
-#ifdef CONFIG_MEMCG
-
-static int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
- return READ_ONCE(lruvec->lrugen.seg);
-}
-
static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{
int seg;
@@ -4089,6 +4116,9 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
else
VM_WARN_ON_ONCE(true);
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+ WRITE_ONCE(lruvec->lrugen.gen, new);
+
hlist_nulls_del_rcu(&lruvec->lrugen.list);
if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
@@ -4099,15 +4129,14 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
pgdat->memcg_lru.nr_memcgs[old]--;
pgdat->memcg_lru.nr_memcgs[new]++;
- lruvec->lrugen.gen = new;
- WRITE_ONCE(lruvec->lrugen.seg, seg);
-
if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
}
+#ifdef CONFIG_MEMCG
+
void lru_gen_online_memcg(struct mem_cgroup *memcg)
{
int gen;
@@ -4124,11 +4153,11 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg)
gen = get_memcg_gen(pgdat->memcg_lru.seq);
+ lruvec->lrugen.gen = gen;
+
hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
pgdat->memcg_lru.nr_memcgs[gen]++;
- lruvec->lrugen.gen = gen;
-
spin_unlock_irq(&pgdat->memcg_lru.lock);
}
}
@@ -4175,18 +4204,11 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
struct lruvec *lruvec = get_lruvec(memcg, nid);
/* see the comment on MEMCG_NR_GENS */
- if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD)
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
}
-#else /* !CONFIG_MEMCG */
-
-static int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
- return 0;
-}
-
-#endif
+#endif /* CONFIG_MEMCG */
/******************************************************************************
* the eviction
@@ -4232,7 +4254,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* protected */
- if (tier > tier_idx) {
+ if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
gen = folio_inc_gen(lruvec, folio, false);
@@ -4598,7 +4620,12 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
}
/* try to scrape all its memory if this memcg was deleted */
- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+ if (!mem_cgroup_online(memcg)) {
+ *nr_to_scan = total;
+ return false;
+ }
+
+ *nr_to_scan = total >> sc->priority;
/*
* The aging tries to be lazy to reduce the overhead, while the eviction
@@ -4635,7 +4662,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
DEFINE_MAX_SEQ(lruvec);
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
- return 0;
+ return -1;
if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
return nr_to_scan;
@@ -4648,20 +4675,41 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}
-static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
{
+ int i;
+ enum zone_watermarks mark;
+
/* don't abort memcg reclaim to ensure fairness */
if (!root_reclaim(sc))
- return -1;
+ return false;
+
+ if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
+ return true;
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (!current_is_kswapd() || sc->order)
+ return false;
+
+ mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ?
+ WMARK_PROMO : WMARK_HIGH;
- return max(sc->nr_to_reclaim, compact_gap(sc->order));
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+ unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH;
+
+ if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0))
+ return false;
+ }
+
+ /* kswapd should abort if all eligible zones are safe */
+ return true;
}
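
should_abort_scan() replaces the flat nr_to_reclaim cutoff: direct reclaim still stops at max(nr_to_reclaim, compact_gap()), while order-0 kswapd keeps scanning until every eligible zone clears its high (or promo) watermark with MIN_LRU_BATCH pages of slack. A simplified userspace model of the per-zone check (hypothetical types; it ignores the lowmem reserves and allocation classes that zone_watermark_ok() accounts for):

#include <stdbool.h>

struct zone_model {
        bool managed;
        unsigned long free_pages;
        unsigned long wmark;            /* WMARK_HIGH or WMARK_PROMO equivalent */
};

/* kswapd-style early abort: stop only when every eligible zone holds its
 * watermark plus one LRU batch of slack */
static bool all_zones_safe(const struct zone_model *zones, int nr_zones,
                           unsigned long batch)
{
        for (int i = 0; i < nr_zones; i++) {
                if (!zones[i].managed)
                        continue;
                if (zones[i].free_pages < zones[i].wmark + batch)
                        return false;   /* this zone still needs reclaim */
        }
        return true;                    /* safe everywhere: kswapd can stop */
}

int main(void)
{
        struct zone_model zones[] = {
                { true, 5000, 4000 },   /* above watermark + slack */
                { true, 4030, 4000 },   /* still inside the slack window */
        };

        return all_zones_safe(zones, 2, 64) ? 1 : 0;    /* expect "not safe" */
}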
static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
long nr_to_scan;
unsigned long scanned = 0;
- unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
int swappiness = get_swappiness(lruvec, sc);
/* clean file folios are more likely to exist */
@@ -4683,13 +4731,13 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
if (scanned >= nr_to_scan)
break;
- if (sc->nr_reclaimed >= nr_to_reclaim)
+ if (should_abort_scan(lruvec, sc))
break;
cond_resched();
}
- /* whether try_to_inc_max_seq() was successful */
+ /* whether this lruvec should be rotated */
return nr_to_scan < 0;
}
@@ -4698,14 +4746,9 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
bool success;
unsigned long scanned = sc->nr_scanned;
unsigned long reclaimed = sc->nr_reclaimed;
- int seg = lru_gen_memcg_seg(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- /* see the comment on MEMCG_NR_GENS */
- if (!lruvec_is_sizable(lruvec, sc))
- return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
-
mem_cgroup_calculate_protection(NULL, memcg);
if (mem_cgroup_below_min(NULL, memcg))
@@ -4713,7 +4756,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
if (mem_cgroup_below_low(NULL, memcg)) {
/* see the comment on MEMCG_NR_GENS */
- if (seg != MEMCG_LRU_TAIL)
+ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
return MEMCG_LRU_TAIL;
memcg_memory_event(memcg, MEMCG_LOW);
@@ -4729,10 +4772,16 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
flush_reclaim_state(sc);
- return success ? MEMCG_LRU_YOUNG : 0;
-}
+ if (success && mem_cgroup_online(memcg))
+ return MEMCG_LRU_YOUNG;
-#ifdef CONFIG_MEMCG
+ if (!success && lruvec_is_sizable(lruvec, sc))
+ return 0;
+
+ /* one retry if offlined or too small */
+ return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
+ MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+}
static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
{
@@ -4743,14 +4792,13 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
struct lruvec *lruvec;
struct lru_gen_folio *lrugen;
struct mem_cgroup *memcg;
- const struct hlist_nulls_node *pos;
- unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ struct hlist_nulls_node *pos;
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
restart:
op = 0;
memcg = NULL;
- gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
rcu_read_lock();
@@ -4761,6 +4809,10 @@ restart:
}
mem_cgroup_put(memcg);
+ memcg = NULL;
+
+ if (gen != READ_ONCE(lrugen->gen))
+ continue;
lruvec = container_of(lrugen, struct lruvec, lrugen);
memcg = lruvec_memcg(lruvec);
@@ -4777,7 +4829,7 @@ restart:
rcu_read_lock();
- if (sc->nr_reclaimed >= nr_to_reclaim)
+ if (should_abort_scan(lruvec, sc))
break;
}
@@ -4788,7 +4840,7 @@ restart:
mem_cgroup_put(memcg);
- if (sc->nr_reclaimed >= nr_to_reclaim)
+ if (!is_a_nulls(pos))
return;
/* restart if raced with lru_gen_rotate_memcg() */
@@ -4822,20 +4874,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
blk_finish_plug(&plug);
}
-#else /* !CONFIG_MEMCG */
-
-static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
-{
- BUILD_BUG();
-}
-
-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
-{
- BUILD_BUG();
-}
-
-#endif
-
static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
{
int priority;
@@ -4845,16 +4883,14 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
return;
/*
- * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
- * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
- * estimated reclaimed_to_scanned_ratio = inactive / total.
+ * Determine the initial priority based on
+ * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+ * where reclaimed_to_scanned_ratio = inactive / total.
*/
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
if (get_swappiness(lruvec, sc))
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
- reclaimable /= MEMCG_NR_GENS;
-
/* round down reclaimable and round up sc->nr_to_reclaim */
priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
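
With the /= MEMCG_NR_GENS step gone, the comment's equation is solved directly: (total >> priority) * (inactive / total) = nr_to_reclaim reduces to inactive >> priority ~= nr_to_reclaim, so the initial priority is roughly log2(reclaimable) - log2(nr_to_reclaim). A standalone worked example of the fls_long() arithmetic (illustrative numbers; __builtin_clzl is a GCC/Clang builtin):

#include <stdio.h>

/* fls_long(): 1-based index of the most significant set bit, 0 for 0 */
static int fls_long_model(unsigned long x)
{
        return x ? (int)(sizeof(long) * 8) - __builtin_clzl(x) : 0;
}

int main(void)
{
        unsigned long reclaimable = 1UL << 20;  /* ~4 GiB of inactive 4 KiB pages */
        unsigned long nr_to_reclaim = 512;

        /* solve (reclaimable >> priority) ~= nr_to_reclaim:
         * round reclaimable down and nr_to_reclaim up to powers of two */
        int priority = fls_long_model(reclaimable) - 1 -
                       fls_long_model(nr_to_reclaim - 1);

        /* 20 - 9 = 11, and (1 << 20) >> 11 == 512 */
        printf("initial priority: %d\n", priority);
        return 0;
}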
@@ -5185,6 +5221,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
int type, tier;
int hist = lru_hist_from_seq(seq);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
@@ -5210,6 +5247,9 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_putc(m, '\n');
}
+ if (!mm_state)
+ return;
+
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
const char *s = " ";
@@ -5217,10 +5257,10 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
if (seq == max_seq && NR_HIST_GENS == 1) {
s = "LOYNFA";
- n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ n = READ_ONCE(mm_state->stats[hist][i]);
} else if (seq != max_seq && NR_HIST_GENS > 1) {
s = "loynfa";
- n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ n = READ_ONCE(mm_state->stats[hist][i]);
}
seq_printf(m, " %10lu%c", n, s[i]);
@@ -5484,11 +5524,24 @@ static const struct file_operations lru_gen_ro_fops = {
* initialization
******************************************************************************/
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
+
void lru_gen_init_lruvec(struct lruvec *lruvec)
{
int i;
int gen, type, zone;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
@@ -5499,47 +5552,46 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
- lruvec->mm_state.seq = MIN_NR_GENS;
+ if (mm_state)
+ mm_state->seq = MIN_NR_GENS;
}
#ifdef CONFIG_MEMCG
-void lru_gen_init_pgdat(struct pglist_data *pgdat)
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
- int i, j;
-
- spin_lock_init(&pgdat->memcg_lru.lock);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- for (i = 0; i < MEMCG_NR_GENS; i++) {
- for (j = 0; j < MEMCG_NR_BINS; j++)
- INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
- }
-}
+ if (!mm_list)
+ return;
-void lru_gen_init_memcg(struct mem_cgroup *memcg)
-{
- INIT_LIST_HEAD(&memcg->mm_list.fifo);
- spin_lock_init(&memcg->mm_list.lock);
+ INIT_LIST_HEAD(&mm_list->fifo);
+ spin_lock_init(&mm_list->lock);
}
void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
int i;
int nid;
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));
+ VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo));
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
sizeof(lruvec->lrugen.nr_pages)));
lruvec->lrugen.list.next = LIST_POISON1;
+ if (!mm_state)
+ continue;
+
for (i = 0; i < NR_BLOOM_FILTERS; i++) {
- bitmap_free(lruvec->mm_state.filters[i]);
- lruvec->mm_state.filters[i] = NULL;
+ bitmap_free(mm_state->filters[i]);
+ mm_state->filters[i] = NULL;
}
}
}
@@ -5565,14 +5617,17 @@ late_initcall(init_lru_gen);
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
+ BUILD_BUG();
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
+ BUILD_BUG();
}
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
+ BUILD_BUG();
}
#endif /* CONFIG_LRU_GEN */
@@ -6365,7 +6420,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
* scan_control uses s8 fields for order, priority, and reclaim_idx.
* Confirm they are large enough for max values.
*/
- BUILD_BUG_ON(MAX_ORDER >= S8_MAX);
+ BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX);
BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);