From 44383cef54c0ce1201f884d83cc2b367bc5aa4f7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 19 Dec 2022 19:09:18 +0100 Subject: kasan: allow sampling page_alloc allocations for HW_TAGS As Hardware Tag-Based KASAN is intended to be used in production, its performance impact is crucial. As page_alloc allocations tend to be big, tagging and checking all such allocations can introduce a significant slowdown. Add two new boot parameters that allow to alleviate that slowdown: - kasan.page_alloc.sample, which makes Hardware Tag-Based KASAN tag only every Nth page_alloc allocation with the order configured by the second added parameter (default: tag every such allocation). - kasan.page_alloc.sample.order, which makes sampling enabled by the first parameter only affect page_alloc allocations with the order equal or greater than the specified value (default: 3, see below). The exact performance improvement caused by using the new parameters depends on their values and the applied workload. The chosen default value for kasan.page_alloc.sample.order is 3, which matches both PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER. This is done for two reasons: 1. PAGE_ALLOC_COSTLY_ORDER is "the order at which allocations are deemed costly to service", which corresponds to the idea that only large and thus costly allocations are supposed to sampled. 2. One of the workloads targeted by this patch is a benchmark that sends a large amount of data over a local loopback connection. Most multi-page data allocations in the networking subsystem have the order of SKB_FRAG_PAGE_ORDER (or PAGE_ALLOC_COSTLY_ORDER). When running a local loopback test on a testing MTE-enabled device in sync mode, enabling Hardware Tag-Based KASAN introduces a ~50% slowdown. Applying this patch and setting kasan.page_alloc.sampling to a value higher than 1 allows to lower the slowdown. The performance improvement saturates around the sampling interval value of 10 with the default sampling page order of 3. This lowers the slowdown to ~20%. The slowdown in real scenarios involving the network will likely be better. Enabling page_alloc sampling has a downside: KASAN misses bad accesses to a page_alloc allocation that has not been tagged. This lowers the value of KASAN as a security mitigation. However, based on measuring the number of page_alloc allocations of different orders during boot in a test build, sampling with the default kasan.page_alloc.sample.order value affects only ~7% of allocations. The rest ~93% of allocations are still checked deterministically. Link: https://lkml.kernel.org/r/129da0614123bb85ed4dd61ae30842b2dd7c903f.1671471846.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Jann Horn Cc: Mark Brand Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/page_alloc.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0745aedebb37..7d980dc0000e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1356,6 +1356,8 @@ out: * see the comment next to it. * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, * see the comment next to it. + * 4. The allocation is excluded from being checked due to sampling, + * see the call to kasan_unpoison_pages. * * Poisoning pages during deferred memory init will greatly lengthen the * process and cause problem in large memory systems as the deferred pages @@ -2468,7 +2470,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, { bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); - bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool reset_tags = !zero_tags; int i; set_page_private(page, 0); @@ -2491,30 +2494,42 @@ inline void post_alloc_hook(struct page *page, unsigned int order, */ /* - * If memory tags should be zeroed (which happens only when memory - * should be initialized as well). + * If memory tags should be zeroed + * (which happens only when memory should be initialized as well). */ - if (init_tags) { + if (zero_tags) { /* Initialize both memory and tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); - /* Note that memory is already initialized by the loop above. */ + /* Take note that memory was initialized by the loop above. */ init = false; } if (!should_skip_kasan_unpoison(gfp_flags)) { - /* Unpoison shadow memory or set memory tags. */ - kasan_unpoison_pages(page, order, init); - - /* Note that memory is already initialized by KASAN. */ - if (kasan_has_integrated_init()) - init = false; - } else { - /* Ensure page_address() dereferencing does not fault. */ + /* Try unpoisoning (or setting tags) and initializing memory. */ + if (kasan_unpoison_pages(page, order, init)) { + /* Take note that memory was initialized by KASAN. */ + if (kasan_has_integrated_init()) + init = false; + /* Take note that memory tags were set by KASAN. */ + reset_tags = false; + } else { + /* + * KASAN decided to exclude this allocation from being + * poisoned due to sampling. Skip poisoning as well. + */ + SetPageSkipKASanPoison(page); + } + } + /* + * If memory tags have not been set, reset the page tags to ensure + * page_address() dereferencing does not fault. + */ + if (reset_tags) { for (i = 0; i != 1 << order; ++i) page_kasan_tag_reset(page + i); } - /* If memory is still not initialized, do it now. */ + /* If memory is still not initialized, initialize it now. */ if (init) kernel_init_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ -- cgit v1.2.3 From e4dde56cd208674ce899b47589f263499e5b8cdc Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:04 -0700 Subject: mm: multi-gen LRU: per-node lru_gen_folio lists For each node, memcgs are divided into two generations: the old and the young. For each generation, memcgs are randomly sharded into multiple bins to improve scalability. For each bin, an RCU hlist_nulls is virtually divided into three segments: the head, the tail and the default. An onlining memcg is added to the tail of a random bin in the old generation. The eviction starts at the head of a random bin in the old generation. The per-node memcg generation counter, whose reminder (mod 2) indexes the old generation, is incremented when all its bins become empty. There are four operations: 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its current generation (old or young) and updates its "seg" to "head"; 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its current generation (old or young) and updates its "seg" to "tail"; 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old generation, updates its "gen" to "old" and resets its "seg" to "default"; 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the young generation, updates its "gen" to "young" and resets its "seg" to "default". The events that trigger the above operations are: 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; 2. The first attempt to reclaim an memcg below low, which triggers MEMCG_LRU_TAIL; 3. The first attempt to reclaim an memcg below reclaimable size threshold, which triggers MEMCG_LRU_TAIL; 4. The second attempt to reclaim an memcg below reclaimable size threshold, which triggers MEMCG_LRU_YOUNG; 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. Note that memcg LRU only applies to global reclaim, and the round-robin incrementing of their max_seq counters ensures the eventual fairness to all eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ++ include/linux/mm_inline.h | 17 +++ include/linux/mmzone.h | 117 +++++++++++++- mm/memcontrol.c | 16 ++ mm/page_alloc.c | 1 + mm/vmscan.c | 374 +++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 500 insertions(+), 35 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d3c8203cab6c..2e08b05bc6bf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -794,6 +794,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) percpu_ref_put(&objcg->refcnt); } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -1301,6 +1306,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index eb8a2435ee80..acf03147fff8 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void) return current->in_lru_fault; } +#ifdef CONFIG_MEMCG +static inline int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return READ_ONCE(lruvec->lrugen.seg); +} +#else +static inline int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return 0; +} +#endif + static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; @@ -297,6 +309,11 @@ static inline bool lru_gen_in_fault(void) return false; } +static inline int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return 0; +} + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6c96ee823dbd..815c7c2edf45 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -367,6 +368,15 @@ struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +/* see the comment on MEMCG_NR_GENS */ +enum { + MEMCG_LRU_NOP, + MEMCG_LRU_HEAD, + MEMCG_LRU_TAIL, + MEMCG_LRU_OLD, + MEMCG_LRU_YOUNG, +}; + #ifdef CONFIG_LRU_GEN enum { @@ -426,6 +436,14 @@ struct lru_gen_folio { atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; +#ifdef CONFIG_MEMCG + /* the memcg generation this lru_gen_folio belongs to */ + u8 gen; + /* the list segment this lru_gen_folio belongs to */ + u8 seg; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_node list; +#endif }; enum { @@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec); void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG + +/* + * For each node, memcgs are divided into two generations: the old and the + * young. For each generation, memcgs are randomly sharded into multiple bins + * to improve scalability. For each bin, the hlist_nulls is virtually divided + * into three segments: the head, the tail and the default. + * + * An onlining memcg is added to the tail of a random bin in the old generation. + * The eviction starts at the head of a random bin in the old generation. The + * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes + * the old generation, is incremented when all its bins become empty. + * + * There are four operations: + * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its + * current generation (old or young) and updates its "seg" to "head"; + * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its + * current generation (old or young) and updates its "seg" to "tail"; + * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old + * generation, updates its "gen" to "old" and resets its "seg" to "default"; + * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the + * young generation, updates its "gen" to "young" and resets its "seg" to + * "default". + * + * The events that trigger the above operations are: + * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; + * 2. The first attempt to reclaim an memcg below low, which triggers + * MEMCG_LRU_TAIL; + * 3. The first attempt to reclaim an memcg below reclaimable size threshold, + * which triggers MEMCG_LRU_TAIL; + * 4. The second attempt to reclaim an memcg below reclaimable size threshold, + * which triggers MEMCG_LRU_YOUNG; + * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; + * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; + * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. + * + * Note that memcg LRU only applies to global reclaim, and the round-robin + * incrementing of their max_seq counters ensures the eventual fairness to all + * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). + */ +#define MEMCG_NR_GENS 2 +#define MEMCG_NR_BINS 8 + +struct lru_gen_memcg { + /* the per-node memcg generation counter */ + unsigned long seq; + /* each memcg has one lru_gen_folio per node */ + unsigned long nr_memcgs[MEMCG_NR_GENS]; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; + /* protects the above */ + spinlock_t lock; +}; + +void lru_gen_init_pgdat(struct pglist_data *pgdat); + void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); -#endif +void lru_gen_online_memcg(struct mem_cgroup *memcg); +void lru_gen_offline_memcg(struct mem_cgroup *memcg); +void lru_gen_release_memcg(struct mem_cgroup *memcg); +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op); + +#else /* !CONFIG_MEMCG */ + +#define MEMCG_NR_GENS 1 + +struct lru_gen_memcg { +}; + +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + +#endif /* CONFIG_MEMCG */ #else /* !CONFIG_LRU_GEN */ +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } @@ -494,6 +587,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } @@ -501,7 +595,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } -#endif + +static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ +} + +#endif /* CONFIG_MEMCG */ #endif /* CONFIG_LRU_GEN */ @@ -1243,6 +1354,8 @@ typedef struct pglist_data { #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; + /* lru_gen_folio list */ + struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 49f67176a1a2..2758b67eb169 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -478,6 +478,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) struct mem_cgroup_per_node *mz; struct mem_cgroup_tree_per_node *mctz; + if (lru_gen_enabled()) { + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; + + /* see the comment on MEMCG_NR_GENS */ + if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); + + return; + } + mctz = soft_limit_tree.rb_tree_per_node[nid]; if (!mctz) return; @@ -3530,6 +3540,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, struct mem_cgroup_tree_per_node *mctz; unsigned long excess; + if (lru_gen_enabled()) + return 0; + if (order > 0) return 0; @@ -5391,6 +5404,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (unlikely(mem_cgroup_is_root(memcg))) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + lru_gen_online_memcg(memcg); return 0; offline_kmem: memcg_offline_kmem(memcg); @@ -5422,6 +5436,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); + lru_gen_offline_memcg(memcg); drain_all_stock(memcg); @@ -5433,6 +5448,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); invalidate_reclaim_iterators(memcg); + lru_gen_release_memcg(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d980dc0000e..5668c1a2de49 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7941,6 +7941,7 @@ static void __init free_area_init_node(int nid) pgdat_set_deferred_range(pgdat); free_area_init_core(pgdat); + lru_gen_init_pgdat(pgdat); } static void __init free_area_init_memoryless_node(int nid) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5a167f8efc38..178465a503db 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,6 +55,8 @@ #include #include #include +#include +#include #include #include @@ -135,11 +137,6 @@ struct scan_control { /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; -#ifdef CONFIG_LRU_GEN - /* help kswapd make better choices among multiple memcgs */ - unsigned long last_reclaimed; -#endif - /* Allocation order */ s8 order; @@ -3185,6 +3182,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) +#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) + static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) { struct pglist_data *pgdat = NODE_DATA(nid); @@ -4453,8 +4453,7 @@ done: if (sc->priority <= DEF_PRIORITY - 2) wait_event_killable(lruvec->mm_state.wait, max_seq < READ_ONCE(lrugen->max_seq)); - - return max_seq < READ_ONCE(lrugen->max_seq); + return false; } VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); @@ -4527,8 +4526,6 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) VM_WARN_ON_ONCE(!current_is_kswapd()); - sc->last_reclaimed = sc->nr_reclaimed; - /* check the order to exclude compaction-induced reclaim */ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) return; @@ -5117,8 +5114,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ -static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, - bool can_swap) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) { unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -5136,10 +5132,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * if (sc->priority == DEF_PRIORITY) return nr_to_scan; - try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false); - /* skip this lruvec as it's low on cold folios */ - return 0; + return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; } static unsigned long get_nr_to_reclaim(struct scan_control *sc) @@ -5148,29 +5142,18 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc) if (!global_reclaim(sc)) return -1; - /* discount the previous progress for kswapd */ - if (current_is_kswapd()) - return sc->nr_to_reclaim + sc->last_reclaimed; - return max(sc->nr_to_reclaim, compact_gap(sc->order)); } -static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { - struct blk_plug plug; + long nr_to_scan; unsigned long scanned = 0; unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); - lru_add_drain(); - - blk_start_plug(&plug); - - set_mm_walk(lruvec_pgdat(lruvec)); - while (true) { int delta; int swappiness; - unsigned long nr_to_scan; if (sc->may_swap) swappiness = get_swappiness(lruvec, sc); @@ -5180,7 +5163,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc swappiness = 0; nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); - if (!nr_to_scan) + if (nr_to_scan <= 0) break; delta = evict_folios(lruvec, sc, swappiness); @@ -5197,11 +5180,252 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc cond_resched(); } + /* whether try_to_inc_max_seq() was successful */ + return nr_to_scan < 0; +} + +static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) +{ + bool success; + unsigned long scanned = sc->nr_scanned; + unsigned long reclaimed = sc->nr_reclaimed; + int seg = lru_gen_memcg_seg(lruvec); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + /* see the comment on MEMCG_NR_GENS */ + if (!lruvec_is_sizable(lruvec, sc)) + return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; + + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(NULL, memcg)) + return MEMCG_LRU_YOUNG; + + if (mem_cgroup_below_low(NULL, memcg)) { + /* see the comment on MEMCG_NR_GENS */ + if (seg != MEMCG_LRU_TAIL) + return MEMCG_LRU_TAIL; + + memcg_memory_event(memcg, MEMCG_LOW); + } + + success = try_to_shrink_lruvec(lruvec, sc); + + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + sc->nr_reclaimed += current->reclaim_state->reclaimed_slab; + current->reclaim_state->reclaimed_slab = 0; + + return success ? MEMCG_LRU_YOUNG : 0; +} + +#ifdef CONFIG_MEMCG + +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ + int gen; + int bin; + int first_bin; + struct lruvec *lruvec; + struct lru_gen_folio *lrugen; + const struct hlist_nulls_node *pos; + int op = 0; + struct mem_cgroup *memcg = NULL; + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + + bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); +restart: + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); + + rcu_read_lock(); + + hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); + + lruvec = container_of(lrugen, struct lruvec, lrugen); + memcg = lruvec_memcg(lruvec); + + if (!mem_cgroup_tryget(memcg)) { + op = 0; + memcg = NULL; + continue; + } + + rcu_read_unlock(); + + op = shrink_one(lruvec, sc); + + if (sc->nr_reclaimed >= nr_to_reclaim) + goto success; + + rcu_read_lock(); + } + + rcu_read_unlock(); + + /* restart if raced with lru_gen_rotate_memcg() */ + if (gen != get_nulls_value(pos)) + goto restart; + + /* try the rest of the bins of the current generation */ + bin = get_memcg_bin(bin + 1); + if (bin != first_bin) + goto restart; +success: + if (op) + lru_gen_rotate_memcg(lruvec, op); + + mem_cgroup_put(memcg); +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + + VM_WARN_ON_ONCE(global_reclaim(sc)); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(lruvec_pgdat(lruvec)); + + if (try_to_shrink_lruvec(lruvec, sc)) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); + clear_mm_walk(); blk_finish_plug(&plug); } +#else /* !CONFIG_MEMCG */ + +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) +{ + BUILD_BUG(); +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + BUILD_BUG(); +} + +#endif + +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) +{ + int priority; + unsigned long reclaimable; + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); + + if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) + return; + /* + * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> + * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the + * estimated reclaimed_to_scanned_ratio = inactive / total. + */ + reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); + if (get_swappiness(lruvec, sc)) + reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); + + reclaimable /= MEMCG_NR_GENS; + + /* round down reclaimable and round up sc->nr_to_reclaim */ + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); + + sc->priority = clamp(priority, 0, DEF_PRIORITY); +} + +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct blk_plug plug; + unsigned long reclaimed = sc->nr_reclaimed; + + VM_WARN_ON_ONCE(!global_reclaim(sc)); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(pgdat); + + set_initial_priority(pgdat, sc); + + if (current_is_kswapd()) + sc->nr_reclaimed = 0; + + if (mem_cgroup_disabled()) + shrink_one(&pgdat->__lruvec, sc); + else + shrink_many(pgdat, sc); + + if (current_is_kswapd()) + sc->nr_reclaimed += reclaimed; + + clear_mm_walk(); + + blk_finish_plug(&plug); + + /* kswapd should never fail */ + pgdat->kswapd_failures = 0; +} + +#ifdef CONFIG_MEMCG +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ + int seg; + int old, new; + int bin = get_random_u32_below(MEMCG_NR_BINS); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + seg = 0; + new = old = lruvec->lrugen.gen; + + /* see the comment on MEMCG_NR_GENS */ + if (op == MEMCG_LRU_HEAD) + seg = MEMCG_LRU_HEAD; + else if (op == MEMCG_LRU_TAIL) + seg = MEMCG_LRU_TAIL; + else if (op == MEMCG_LRU_OLD) + new = get_memcg_gen(pgdat->memcg_lru.seq); + else if (op == MEMCG_LRU_YOUNG) + new = get_memcg_gen(pgdat->memcg_lru.seq + 1); + else + VM_WARN_ON_ONCE(true); + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + else + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + + pgdat->memcg_lru.nr_memcgs[old]--; + pgdat->memcg_lru.nr_memcgs[new]++; + + lruvec->lrugen.gen = new; + WRITE_ONCE(lruvec->lrugen.seg, seg); + + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); +} +#endif + /****************************************************************************** * state change ******************************************************************************/ @@ -5655,11 +5879,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (!mem_cgroup_disabled()) { rcu_read_lock(); + memcg = mem_cgroup_from_id(memcg_id); -#ifdef CONFIG_MEMCG - if (memcg && !css_tryget(&memcg->css)) + if (!mem_cgroup_tryget(memcg)) memcg = NULL; -#endif + rcu_read_unlock(); if (!memcg) @@ -5807,6 +6031,19 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) } #ifdef CONFIG_MEMCG + +void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ + int i, j; + + spin_lock_init(&pgdat->memcg_lru.lock); + + for (i = 0; i < MEMCG_NR_GENS; i++) { + for (j = 0; j < MEMCG_NR_BINS; j++) + INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); + } +} + void lru_gen_init_memcg(struct mem_cgroup *memcg) { INIT_LIST_HEAD(&memcg->mm_list.fifo); @@ -5830,7 +6067,69 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) } } } -#endif + +void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + int bin = get_random_u32_below(MEMCG_NR_BINS); + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = get_memcg_gen(pgdat->memcg_lru.seq); + + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); + pgdat->memcg_lru.nr_memcgs[gen]++; + + lruvec->lrugen.gen = gen; + + spin_unlock(&pgdat->memcg_lru.lock); + } +} + +void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); + } +} + +void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = lruvec->lrugen.gen; + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + pgdat->memcg_lru.nr_memcgs[gen]--; + + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); + } +} + +#endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) { @@ -5857,6 +6156,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc { } +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} + #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) @@ -5870,7 +6173,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; - if (lru_gen_enabled()) { + if (lru_gen_enabled() && !global_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); return; } @@ -6113,6 +6416,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; + if (lru_gen_enabled() && global_reclaim(sc)) { + lru_gen_shrink_node(pgdat, sc); + return; + } + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: -- cgit v1.2.3 From fc5744881eabcc73ed24a4229034f0fbdeb3f46f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 4 Jan 2023 21:18:05 +0200 Subject: mm/page_alloc: invert logic for early page initialisation checks Rename early_page_uninitialised() to early_page_initialised() and invert its logic to make the code more readable. Link: https://lkml.kernel.org/r/20230104191805.2535864-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Acked-by: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5668c1a2de49..5514d84cc712 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -443,15 +443,15 @@ static inline bool deferred_pages_enabled(void) return static_branch_unlikely(&deferred_pages); } -/* Returns true if the struct page for the pfn is uninitialised */ -static inline bool __meminit early_page_uninitialised(unsigned long pfn) +/* Returns true if the struct page for the pfn is initialised */ +static inline bool __meminit early_page_initialised(unsigned long pfn) { int nid = early_pfn_to_nid(pfn); if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) - return true; + return false; - return false; + return true; } /* @@ -498,9 +498,9 @@ static inline bool deferred_pages_enabled(void) return false; } -static inline bool early_page_uninitialised(unsigned long pfn) +static inline bool early_page_initialised(unsigned long pfn) { - return false; + return true; } static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) @@ -1643,7 +1643,7 @@ static void __meminit init_reserved_page(unsigned long pfn) pg_data_t *pgdat; int nid, zid; - if (!early_page_uninitialised(pfn)) + if (early_page_initialised(pfn)) return; nid = early_pfn_to_nid(pfn); @@ -1806,7 +1806,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (early_page_uninitialised(pfn)) + if (!early_page_initialised(pfn)) return; if (!kmsan_memblock_free_pages(page, order)) { /* KMSAN will take care of these pages. */ -- cgit v1.2.3 From 94688e8eb453e616098cb930e5f6fed4a6ea2dfa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:47 +0000 Subject: mm: remove folio_pincount_ptr() and head_compound_pincount() We can use folio->_pincount directly, since all users are guarded by tests of compound/large. Link: https://lkml.kernel.org/r/20230111142915.1001531-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: John Hubbard Signed-off-by: Andrew Morton --- Documentation/core-api/pin_user_pages.rst | 29 ++++++++++++++--------------- include/linux/mm.h | 14 ++------------ include/linux/mm_types.h | 5 ----- mm/debug.c | 4 ++-- mm/gup.c | 8 ++++---- mm/huge_memory.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/page_alloc.c | 9 ++++++--- 8 files changed, 32 insertions(+), 45 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index facafbdecb95..9fb0b1080d3b 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -55,18 +55,17 @@ flags the caller provides. The caller is required to pass in a non-null struct pages* array, and the function then pins pages by incrementing each by a special value: GUP_PIN_COUNTING_BIAS. -For compound pages, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, -an exact form of pin counting is achieved, by using the 2nd struct page -in the compound page. A new struct page field, compound_pincount, has -been added in order to support this. - -This approach for compound pages avoids the counting upper limit problems that -are discussed below. Those limitations would have been aggravated severely by -huge pages, because each tail page adds a refcount to the head page. And in -fact, testing revealed that, without a separate compound_pincount field, -page overflows were seen in some huge page stress tests. - -This also means that huge pages and compound pages do not suffer +For large folios, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, +the extra space available in the struct folio is used to store the +pincount directly. + +This approach for large folios avoids the counting upper limit problems +that are discussed below. Those limitations would have been aggravated +severely by huge pages, because each tail page adds a refcount to the +head page. And in fact, testing revealed that, without a separate pincount +field, refcount overflows were seen in some huge page stress tests. + +This also means that huge pages and large folios do not suffer from the false positives problem that is mentioned below.:: Function @@ -264,9 +263,9 @@ place.) Other diagnostics ================= -dump_page() has been enhanced slightly, to handle these new counting -fields, and to better report on compound pages in general. Specifically, -for compound pages, the exact (compound_pincount) pincount is reported. +dump_page() has been enhanced slightly to handle these new counting +fields, and to better report on large folios in general. Specifically, +for large folios, the exact pincount is reported. References ========== diff --git a/include/linux/mm.h b/include/linux/mm.h index 76c97cb8ee9a..6d3945207067 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,11 +1011,6 @@ static inline void folio_set_compound_dtor(struct folio *folio, void destroy_large_folio(struct folio *folio); -static inline int head_compound_pincount(struct page *head) -{ - return atomic_read(compound_pincount_ptr(head)); -} - static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; @@ -1641,11 +1636,6 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } -static inline atomic_t *folio_pincount_ptr(struct folio *folio) -{ - return &folio_page(folio, 1)->compound_pincount; -} - /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. @@ -1663,7 +1653,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) * expected to be able to deal gracefully with a false positive. * * For large folios, the result will be exactly correct. That's because - * we have more tracking data available: the compound_pincount is used + * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. @@ -1674,7 +1664,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_test_large(folio)) - return atomic_read(folio_pincount_ptr(folio)) > 0; + return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 10b6eb311ede..6ff1d7db00a7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -443,11 +443,6 @@ static inline atomic_t *subpages_mapcount_ptr(struct page *page) return &page[1].subpages_mapcount; } -static inline atomic_t *compound_pincount_ptr(struct page *page) -{ - return &page[1].compound_pincount; -} - /* * Used for sizing the vmemmap region on some architectures */ diff --git a/mm/debug.c b/mm/debug.c index 7f8e5f744e42..893c9dbf76ca 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,11 +94,11 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d pincount:%d\n", head, compound_order(head), head_compound_mapcount(head), head_subpages_mapcount(head), - head_compound_pincount(head)); + atomic_read(&folio->_pincount)); } #ifdef CONFIG_MEMCG diff --git a/mm/gup.c b/mm/gup.c index f45a3a5be53a..38ba1697dd61 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -111,7 +111,7 @@ retry: * FOLL_GET: folio's refcount will be incremented by @refs. * * FOLL_PIN on large folios: folio's refcount will be incremented by - * @refs, and its compound_pincount will be incremented by @refs. + * @refs, and its pincount will be incremented by @refs. * * FOLL_PIN on single-page folios: folio's refcount will be incremented by * @refs * GUP_PIN_COUNTING_BIAS. @@ -157,7 +157,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) * try_get_folio() is left intact. */ if (folio_test_large(folio)) - atomic_add(refs, folio_pincount_ptr(folio)); + atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); @@ -182,7 +182,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) if (flags & FOLL_PIN) { node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) - atomic_sub(refs, folio_pincount_ptr(folio)); + atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; } @@ -232,7 +232,7 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) */ if (folio_test_large(folio)) { folio_ref_add(folio, 1); - atomic_add(1, folio_pincount_ptr(folio)); + atomic_add(1, &folio->_pincount); } else { folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c13b1f67d14e..9570f03cdee4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2477,9 +2477,9 @@ static void __split_huge_page_tail(struct page *head, int tail, * of swap cache pages that store the swp_entry_t in tail pages. * Fix up and warn once if private is unexpectedly set. * - * What of 32-bit systems, on which head[1].compound_pincount overlays + * What of 32-bit systems, on which folio->_pincount overlays * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and - * compound_pincount must be 0 for folio_ref_freeze() to have succeeded. + * pincount must be 0 for folio_ref_freeze() to have succeeded. */ if (!folio_test_swapcache(page_folio(head))) { VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 273a6522aa4c..15b2707c1600 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1476,7 +1476,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, atomic_set(folio_mapcount_ptr(folio), 0); atomic_set(folio_subpages_mapcount_ptr(folio), 0); - atomic_set(folio_pincount_ptr(folio), 0); + atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { p = folio_page(folio, i); @@ -1998,7 +1998,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, } atomic_set(folio_mapcount_ptr(folio), -1); atomic_set(folio_subpages_mapcount_ptr(folio), 0); - atomic_set(folio_pincount_ptr(folio), 0); + atomic_set(&folio->_pincount, 0); return true; out_error: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5514d84cc712..b224c2132ed1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -775,11 +775,13 @@ void free_compound_page(struct page *page) static void prep_compound_head(struct page *page, unsigned int order) { + struct folio *folio = (struct folio *)page; + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); atomic_set(subpages_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(&folio->_pincount, 0); } static void prep_compound_tail(struct page *head, int tail_idx) @@ -1291,6 +1293,7 @@ static inline bool free_page_is_bad(struct page *page) static int free_tail_pages_check(struct page *head_page, struct page *page) { + struct folio *folio = (struct folio *)head_page; int ret = 1; /* @@ -1314,8 +1317,8 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) bad_page(page, "nonzero subpages_mapcount"); goto out; } - if (unlikely(head_compound_pincount(head_page))) { - bad_page(page, "nonzero compound_pincount"); + if (unlikely(atomic_read(&folio->_pincount))) { + bad_page(page, "nonzero pincount"); goto out; } break; -- cgit v1.2.3 From 65a689f35ad7ebbfb79f429c1bc290b042ebb10b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:28:55 +0000 Subject: page_alloc: use folio fields directly Rmove the uses of compound_mapcount_ptr(), head_compound_mapcount() and subpages_mapcount_ptr() Link: https://lkml.kernel.org/r/20230111142915.1001531-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b224c2132ed1..f15e0e15243f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -779,8 +779,8 @@ static void prep_compound_head(struct page *page, unsigned int order) set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); - atomic_set(compound_mapcount_ptr(page), -1); - atomic_set(subpages_mapcount_ptr(page), 0); + atomic_set(&folio->_entire_mapcount, -1); + atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); } @@ -1309,12 +1309,12 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) switch (page - head_page) { case 1: /* the first tail page: these may be in place of ->mapping */ - if (unlikely(head_compound_mapcount(head_page))) { - bad_page(page, "nonzero compound_mapcount"); + if (unlikely(folio_entire_mapcount(folio))) { + bad_page(page, "nonzero entire_mapcount"); goto out; } - if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) { - bad_page(page, "nonzero subpages_mapcount"); + if (unlikely(atomic_read(&folio->_nr_pages_mapped))) { + bad_page(page, "nonzero nr_pages_mapped"); goto out; } if (unlikely(atomic_read(&folio->_pincount))) { -- cgit v1.2.3 From a60d5942cc9bd65806b69360020d9b1664c747ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 11 Jan 2023 14:29:06 +0000 Subject: mm: convert destroy_large_folio() to use folio_dtor Replace a use of compound_dtor. Link: https://lkml.kernel.org/r/20230111142915.1001531-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f15e0e15243f..88494e82843d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -807,7 +807,7 @@ void prep_compound_page(struct page *page, unsigned int order) void destroy_large_folio(struct folio *folio) { - enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; + enum compound_dtor_id dtor = folio->_folio_dtor; VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); compound_page_dtors[dtor](&folio->page); -- cgit v1.2.3 From 96f97c438f61ddba94117dcd1a1eb0aaafa22309 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Jan 2023 12:39:31 +0000 Subject: mm: mlock: update the interface to use folios Update the mlock interface to accept folios rather than pages, bringing the interface in line with the internal implementation. munlock_vma_page() still requires a page_folio() conversion, however this is consistent with the existent mlock_vma_page() implementation and a product of rmap still dealing in pages rather than folios. Link: https://lkml.kernel.org/r/cba12777c5544305014bc0cbec56bb4cc71477d8.1673526881.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Christian Brauner Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Joel Fernandes (Google) Cc: Jonathan Corbet Cc: Liam R. Howlett Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/internal.h | 38 ++++++++++++++++++++++---------------- mm/migrate.c | 2 +- mm/mlock.c | 38 ++++++++++++++++++-------------------- mm/page_alloc.c | 2 +- mm/rmap.c | 4 ++-- mm/swap.c | 10 +++++----- 6 files changed, 49 insertions(+), 45 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 583e15357e09..973b48e8b1af 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -533,10 +533,9 @@ extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * - * mlock is usually called at the end of page_add_*_rmap(), - * munlock at the end of page_remove_rmap(); but new anon - * pages are managed by lru_cache_add_inactive_or_unevictable() - * calling mlock_new_page(). + * mlock is usually called at the end of page_add_*_rmap(), munlock at + * the end of page_remove_rmap(); but new anon folios are managed by + * folio_add_lru_vma() calling mlock_new_folio(). * * @compound is used to include pmd mappings of THPs, but filter out * pte mappings of THPs, which cannot be consistently counted: a pte @@ -565,18 +564,25 @@ static inline void mlock_vma_page(struct page *page, mlock_vma_folio(page_folio(page), vma, compound); } -void munlock_page(struct page *page); -static inline void munlock_vma_page(struct page *page, +void munlock_folio(struct folio *folio); + +static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma, bool compound) { if (unlikely(vma->vm_flags & VM_LOCKED) && - (compound || !PageTransCompound(page))) - munlock_page(page); + (compound || !folio_test_large(folio))) + munlock_folio(folio); +} + +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + munlock_vma_folio(page_folio(page), vma, compound); } -void mlock_new_page(struct page *page); -bool need_mlock_page_drain(int cpu); -void mlock_page_drain_local(void); -void mlock_page_drain_remote(int cpu); +void mlock_new_folio(struct folio *folio); +bool need_mlock_drain(int cpu); +void mlock_drain_local(void); +void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -665,10 +671,10 @@ static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } -static inline void mlock_new_page(struct page *page) { } -static inline bool need_mlock_page_drain(int cpu) { return false; } -static inline void mlock_page_drain_local(void) { } -static inline void mlock_page_drain_remote(int cpu) { } +static inline void mlock_new_folio(struct folio *folio) { } +static inline bool need_mlock_drain(int cpu) { return false; } +static inline void mlock_drain_local(void) { } +static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/migrate.c b/mm/migrate.c index 98de7ce2b576..206fcdbe67f3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -265,7 +265,7 @@ static bool remove_migration_pte(struct folio *folio, set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); trace_remove_migration_pte(pvmw.address, pte_val(pte), compound_order(new)); diff --git a/mm/mlock.c b/mm/mlock.c index f8e8d30ab08a..9e9c8be58277 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -210,7 +210,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) folio_batch_reinit(fbatch); } -void mlock_page_drain_local(void) +void mlock_drain_local(void) { struct folio_batch *fbatch; @@ -221,7 +221,7 @@ void mlock_page_drain_local(void) local_unlock(&mlock_fbatch.lock); } -void mlock_page_drain_remote(int cpu) +void mlock_drain_remote(int cpu) { struct folio_batch *fbatch; @@ -231,7 +231,7 @@ void mlock_page_drain_remote(int cpu) mlock_folio_batch(fbatch); } -bool need_mlock_page_drain(int cpu) +bool need_mlock_drain(int cpu) { return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } @@ -262,13 +262,12 @@ void mlock_folio(struct folio *folio) } /** - * mlock_new_page - mlock a newly allocated page not yet on LRU - * @page: page to be mlocked, either a normal page or a THP head. + * mlock_new_folio - mlock a newly allocated folio not yet on LRU + * @folio: folio to be mlocked, either normal or a THP head. */ -void mlock_new_page(struct page *page) +void mlock_new_folio(struct folio *folio) { struct folio_batch *fbatch; - struct folio *folio = page_folio(page); int nr_pages = folio_nr_pages(folio); local_lock(&mlock_fbatch.lock); @@ -286,13 +285,12 @@ void mlock_new_page(struct page *page) } /** - * munlock_page - munlock a page - * @page: page to be munlocked, either a normal page or a THP head. + * munlock_folio - munlock a folio + * @folio: folio to be munlocked, either normal or a THP head. */ -void munlock_page(struct page *page) +void munlock_folio(struct folio *folio) { struct folio_batch *fbatch; - struct folio *folio = page_folio(page); local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); @@ -314,7 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -322,11 +320,11 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, goto out; if (is_huge_zero_pmd(*pmd)) goto out; - page = pmd_page(*pmd); + folio = page_folio(pmd_page(*pmd)); if (vma->vm_flags & VM_LOCKED) - mlock_folio(page_folio(page)); + mlock_folio(folio); else - munlock_page(page); + munlock_folio(folio); goto out; } @@ -334,15 +332,15 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; - page = vm_normal_page(vma, addr, *pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, *pte); + if (!folio || folio_is_zone_device(folio)) continue; - if (PageTransCompound(page)) + if (folio_test_large(folio)) continue; if (vma->vm_flags & VM_LOCKED) - mlock_folio(page_folio(page)); + mlock_folio(folio); else - munlock_page(page); + munlock_folio(folio); } pte_unmap(start_pte); out: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 88494e82843d..83be3b571fd0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8587,7 +8587,7 @@ static int page_alloc_cpu_dead(unsigned int cpu) struct zone *zone; lru_add_drain_cpu(cpu); - mlock_page_drain_remote(cpu); + mlock_drain_remote(cpu); drain_pages(cpu); /* diff --git a/mm/rmap.c b/mm/rmap.c index a079d9964b9c..073999f78adf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1764,7 +1764,7 @@ discard: */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); folio_put(folio); } @@ -2105,7 +2105,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); folio_put(folio); } diff --git a/mm/swap.c b/mm/swap.c index e54e2a252e27..42d67f9baa8c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -562,7 +562,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) - mlock_new_page(&folio->page); + mlock_new_folio(folio); else folio_add_lru(folio); } @@ -781,7 +781,7 @@ void lru_add_drain(void) local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); - mlock_page_drain_local(); + mlock_drain_local(); } /* @@ -796,7 +796,7 @@ static void lru_add_and_bh_lrus_drain(void) lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); - mlock_page_drain_local(); + mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) @@ -805,7 +805,7 @@ void lru_add_drain_cpu_zone(struct zone *zone) lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); - mlock_page_drain_local(); + mlock_drain_local(); } #ifdef CONFIG_SMP @@ -828,7 +828,7 @@ static bool cpu_needs_drain(unsigned int cpu) folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->activate) || - need_mlock_page_drain(cpu) || + need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } -- cgit v1.2.3 From 524c48072e5673f4511f1ad81493e2485863fd65 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:12 +0000 Subject: mm/page_alloc: rename ALLOC_HIGH to ALLOC_MIN_RESERVE Patch series "Discard __GFP_ATOMIC", v3. Neil's patch has been residing in mm-unstable as commit 2fafb4fe8f7a ("mm: discard __GFP_ATOMIC") for a long time and recently brought up again. Most recently, I was worried that __GFP_HIGH allocations could use high-order atomic reserves which is unintentional but there was no response so lets revisit -- this series reworks how min reserves are used, protects highorder reserves and then finishes with Neil's patch with very minor modifications so it fits on top. There was a review discussion on renaming __GFP_DIRECT_RECLAIM to __GFP_ALLOW_BLOCKING but I didn't think it was that big an issue and is orthogonal to the removal of __GFP_ATOMIC. There were some concerns about how the gfp flags affect the min reserves but it never reached a solid conclusion so I made my own attempt. The series tries to iron out some of the details on how reserves are used. ALLOC_HIGH becomes ALLOC_MIN_RESERVE and ALLOC_HARDER becomes ALLOC_NON_BLOCK and documents how the reserves are affected. For example, ALLOC_NON_BLOCK (no direct reclaim) on its own allows 25% of the min reserve. ALLOC_MIN_RESERVE (__GFP_HIGH) allows 50% and both combined allows deeper access again. ALLOC_OOM allows access to 75%. High-order atomic allocations are explicitly handled with the caveat that no __GFP_ATOMIC flag means that any high-order allocation that specifies GFP_HIGH and cannot enter direct reclaim will be treated as if it was GFP_ATOMIC. This patch (of 6): __GFP_HIGH aliases to ALLOC_HIGH but the name does not really hint what it means. As ALLOC_HIGH is internal to the allocator, rename it to ALLOC_MIN_RESERVE to document that the min reserves can be depleted. Link: https://lkml.kernel.org/r/20230113111217.14134-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20230113111217.14134-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 4 +++- mm/page_alloc.c | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 973b48e8b1af..99eb544fbded 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -779,7 +779,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #endif #define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% + * of the min watermark. + */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83be3b571fd0..4c1f1d487c3e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3994,7 +3994,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, /* free_pages may go negative - that's OK */ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); - if (alloc_flags & ALLOC_HIGH) + if (alloc_flags & ALLOC_MIN_RESERVE) min -= min / 2; if (unlikely(alloc_harder)) { @@ -4836,18 +4836,18 @@ gfp_to_alloc_flags(gfp_t gfp_mask) unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* - * __GFP_HIGH is assumed to be the same as ALLOC_HIGH + * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD * to save two branches. */ - BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE); BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); /* * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_MIN_RESERVE(__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); -- cgit v1.2.3 From c988dcbecf3fd5430921eaa3fe9054754f76d185 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:13 +0000 Subject: mm/page_alloc: treat RT tasks similar to __GFP_HIGH RT tasks are allowed to dip below the min reserve but ALLOC_HARDER is typically combined with ALLOC_MIN_RESERVE so RT tasks are a little unusual. While there is some justification for allowing RT tasks access to memory reserves, there is a strong chance that a RT task that is also under memory pressure is at risk of missing deadlines anyway. Relax how much reserves an RT task can access by treating it the same as __GFP_HIGH allocations. Note that in a future kernel release that the RT special casing will be removed. Hard realtime tasks should be locking down resources in advance and ensuring enough memory is available. Even a soft-realtime task like audio or video live decoding which cannot jitter should be allocating both memory and any disk space required up-front before the recording starts instead of relying on reserves. At best, reserve access will only delay the problem by a very short interval. Link: https://lkml.kernel.org/r/20230113111217.14134-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4c1f1d487c3e..4f5c2e83cb7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4865,7 +4865,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) */ alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && in_task()) - alloc_flags |= ALLOC_HARDER; + alloc_flags |= ALLOC_MIN_RESERVE; alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); -- cgit v1.2.3 From eb2e2b425c6984ca8034448a3f2c680622bd3d4d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:14 +0000 Subject: mm/page_alloc: explicitly record high-order atomic allocations in alloc_flags A high-order ALLOC_HARDER allocation is assumed to be atomic. While that is accurate, it changes later in the series. In preparation, explicitly record high-order atomic allocations in gfp_to_alloc_flags(). Link: https://lkml.kernel.org/r/20230113111217.14134-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 1 + mm/page_alloc.c | 29 +++++++++++++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 99eb544fbded..62428467d7cf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -789,6 +789,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #else #define ALLOC_NOFRAGMENT 0x0 #endif +#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ enum ttu_flags; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f5c2e83cb7b..8a7e1cfa8353 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3724,10 +3724,20 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, * reserved for high-order atomic allocation, so order-0 * request should skip it. */ - if (order > 0 && alloc_flags & ALLOC_HARDER) + if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { page = __rmqueue(zone, order, migratetype, alloc_flags); + + /* + * If the allocation fails, allow OOM handling access + * to HIGHATOMIC reserves as failing now is worse than + * failing a high-order atomic allocation in the + * future. + */ + if (!page && (alloc_flags & ALLOC_OOM)) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { spin_unlock_irqrestore(&zone->lock, flags); return NULL; @@ -4041,8 +4051,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif - if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) + if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && + !free_area_empty(area, MIGRATE_HIGHATOMIC)) { return true; + } } return false; } @@ -4304,7 +4316,7 @@ try_this_zone: * If this is a high-order atomic allocation then check * if the pageblock should be reserved for the future */ - if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) reserve_highatomic_pageblock(page, zone, order); return page; @@ -4831,7 +4843,7 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, } static inline unsigned int -gfp_to_alloc_flags(gfp_t gfp_mask) +gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) { unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; @@ -4857,8 +4869,13 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (!(gfp_mask & __GFP_NOMEMALLOC)) { alloc_flags |= ALLOC_HARDER; + + if (order > 0) + alloc_flags |= ALLOC_HIGHATOMIC; + } + /* * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the * comment for __cpuset_node_allowed(). @@ -5066,7 +5083,7 @@ restart: * kswapd needs to be woken up, and to avoid the cost of setting up * alloc_flags precisely. So we do that now. */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); + alloc_flags = gfp_to_alloc_flags(gfp_mask, order); /* * We need to recalculate the starting point for the zonelist iterator -- cgit v1.2.3 From ab3508854353793cd35e348fde89a5c09b2fd8b5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:15 +0000 Subject: mm/page_alloc: explicitly define what alloc flags deplete min reserves As there are more ALLOC_ flags that affect reserves, define what flags affect reserves and clarify the effect of each flag. Link: https://lkml.kernel.org/r/20230113111217.14134-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Signed-off-by: Andrew Morton --- mm/internal.h | 3 +++ mm/page_alloc.c | 34 ++++++++++++++++++++++------------ 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index 62428467d7cf..f7c3bc80c475 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -792,6 +792,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ +/* Flags that allow allocations below the min watermark. */ +#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) + enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8a7e1cfa8353..ffe2151d11ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3967,15 +3967,14 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); static inline long __zone_watermark_unusable_free(struct zone *z, unsigned int order, unsigned int alloc_flags) { - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); long unusable_free = (1 << order) - 1; /* - * If the caller does not have rights to ALLOC_HARDER then subtract - * the high-atomic reserves. This will over-estimate the size of the - * atomic reserve but it avoids a search. + * If the caller does not have rights to reserves below the min + * watermark then subtract the high-atomic reserves. This will + * over-estimate the size of the atomic reserve but it avoids a search. */ - if (likely(!alloc_harder)) + if (likely(!(alloc_flags & ALLOC_RESERVES))) unusable_free += z->nr_reserved_highatomic; #ifdef CONFIG_CMA @@ -3999,25 +3998,36 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, { long min = mark; int o; - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); - if (alloc_flags & ALLOC_MIN_RESERVE) - min -= min / 2; + if (unlikely(alloc_flags & ALLOC_RESERVES)) { + /* + * __GFP_HIGH allows access to 50% of the min reserve as well + * as OOM. + */ + if (alloc_flags & ALLOC_MIN_RESERVE) + min -= min / 2; - if (unlikely(alloc_harder)) { /* - * OOM victims can try even harder than normal ALLOC_HARDER + * Non-blocking allocations can access some of the reserve + * with more access if also __GFP_HIGH. The reasoning is that + * a non-blocking caller may incur a more severe penalty + * if it cannot get memory quickly, particularly if it's + * also __GFP_HIGH. + */ + if (alloc_flags & ALLOC_HARDER) + min -= min / 4; + + /* + * OOM victims can try even harder than the normal reserve * users on the grounds that it's definitely going to be in * the exit path shortly and free memory. Any allocation it * makes during the free path will be small and short-lived. */ if (alloc_flags & ALLOC_OOM) min -= min / 2; - else - min -= min / 4; } /* -- cgit v1.2.3 From 1ebbb21811b76c3b932959787f37985af36f62fa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 13 Jan 2023 11:12:16 +0000 Subject: mm/page_alloc: explicitly define how __GFP_HIGH non-blocking allocations accesses reserves GFP_ATOMIC allocations get flagged ALLOC_HARDER which is a vague description. In preparation for the removal of GFP_ATOMIC redefine __GFP_ATOMIC to simply mean non-blocking and renaming ALLOC_HARDER to ALLOC_NON_BLOCK accordingly. __GFP_HIGH is required for access to reserves but non-blocking is granted more access. For example, GFP_NOWAIT is non-blocking but has no special access to reserves. A __GFP_NOFAIL blocking allocation is granted access similar to __GFP_HIGH if the only alternative is an OOM kill. Link: https://lkml.kernel.org/r/20230113111217.14134-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 7 +++++-- mm/page_alloc.c | 44 ++++++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 22 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/internal.h b/mm/internal.h index f7c3bc80c475..b0b88a95347f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -778,7 +778,10 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif -#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access + * to 25% of the min watermark or + * 62.5% if __GFP_HIGH is set. + */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. */ @@ -793,7 +796,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ -#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) +#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ffe2151d11ee..18ca33a1945d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4007,18 +4007,19 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * __GFP_HIGH allows access to 50% of the min reserve as well * as OOM. */ - if (alloc_flags & ALLOC_MIN_RESERVE) + if (alloc_flags & ALLOC_MIN_RESERVE) { min -= min / 2; - /* - * Non-blocking allocations can access some of the reserve - * with more access if also __GFP_HIGH. The reasoning is that - * a non-blocking caller may incur a more severe penalty - * if it cannot get memory quickly, particularly if it's - * also __GFP_HIGH. - */ - if (alloc_flags & ALLOC_HARDER) - min -= min / 4; + /* + * Non-blocking allocations (e.g. GFP_ATOMIC) can + * access more reserves than just __GFP_HIGH. Other + * non-blocking allocations requests such as GFP_NOWAIT + * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get + * access to the min reserve. + */ + if (alloc_flags & ALLOC_NON_BLOCK) + min -= min / 4; + } /* * OOM victims can try even harder than the normal reserve @@ -4869,28 +4870,30 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_MIN_RESERVE(__GFP_HIGH). + * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. */ if (!(gfp_mask & __GFP_NOMEMALLOC)) { - alloc_flags |= ALLOC_HARDER; + alloc_flags |= ALLOC_NON_BLOCK; if (order > 0) alloc_flags |= ALLOC_HIGHATOMIC; } /* - * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the - * comment for __cpuset_node_allowed(). + * Ignore cpuset mems for non-blocking __GFP_HIGH (probably + * GFP_ATOMIC) rather than fail, see the comment for + * __cpuset_node_allowed(). */ - alloc_flags &= ~ALLOC_CPUSET; + if (alloc_flags & ALLOC_MIN_RESERVE) + alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && in_task()) alloc_flags |= ALLOC_MIN_RESERVE; @@ -5321,12 +5324,13 @@ nopage: WARN_ON_ONCE_GFP(costly_order, gfp_mask); /* - * Help non-failing allocations by giving them access to memory - * reserves but do not use ALLOC_NO_WATERMARKS because this + * Help non-failing allocations by giving some access to memory + * reserves normally used for high priority non-blocking + * allocations but do not use ALLOC_NO_WATERMARKS because this * could deplete whole memory reserves which would just make - * the situation worse + * the situation worse. */ - page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); if (page) goto got_pg; -- cgit v1.2.3 From 2973d8229b78d3f148e0c45916a1e8b237dc6167 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 13 Jan 2023 11:12:17 +0000 Subject: mm: discard __GFP_ATOMIC __GFP_ATOMIC serves little purpose. Its main effect is to set ALLOC_HARDER which adds a few little boosts to increase the chance of an allocation succeeding, one of which is to lower the water-mark at which it will succeed. It is *always* paired with __GFP_HIGH which sets ALLOC_HIGH which also adjusts this watermark. It is probable that other users of __GFP_HIGH should benefit from the other little bonuses that __GFP_ATOMIC gets. __GFP_ATOMIC also gives a warning if used with __GFP_DIRECT_RECLAIM. There is little point to this. We already get a might_sleep() warning if __GFP_DIRECT_RECLAIM is set. __GFP_ATOMIC allows the "watermark_boost" to be side-stepped. It is probable that testing ALLOC_HARDER is a better fit here. __GFP_ATOMIC is used by tegra-smmu.c to check if the allocation might sleep. This should test __GFP_DIRECT_RECLAIM instead. This patch: - removes __GFP_ATOMIC - allows __GFP_HIGH allocations to ignore watermark boosting as well as GFP_ATOMIC requests. - makes other adjustments as suggested by the above. The net result is not change to GFP_ATOMIC allocations. Other allocations that use __GFP_HIGH will benefit from a few different extra privileges. This affects: xen, dm, md, ntfs3 the vermillion frame buffer hibernation ksm swap all of which likely produce more benefit than cost if these selected allocation are more likely to succeed quickly. [mgorman: Minor adjustments to rework on top of a series] Link: https://lkml.kernel.org/r/163712397076.13692.4727608274002939094@noble.neil.brown.name Link: https://lkml.kernel.org/r/20230113111217.14134-7-mgorman@techsingularity.net Signed-off-by: NeilBrown Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: Thierry Reding Signed-off-by: Andrew Morton --- Documentation/mm/balance.rst | 2 +- drivers/iommu/tegra-smmu.c | 4 ++-- include/linux/gfp_types.h | 12 ++++-------- include/trace/events/mmflags.h | 1 - lib/test_printf.c | 8 ++++---- mm/internal.h | 2 +- mm/page_alloc.c | 13 +++---------- tools/perf/builtin-kmem.c | 1 - 8 files changed, 15 insertions(+), 28 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst index 6a1fadf3e173..e38e9d83c1c7 100644 --- a/Documentation/mm/balance.rst +++ b/Documentation/mm/balance.rst @@ -6,7 +6,7 @@ Memory Balancing Started Jan 2000 by Kanoj Sarcar -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as well as for non __GFP_IO allocations. The first reason why a caller may avoid reclaim is that the caller can not diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 5b1af40221ec..af8d0e685260 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, * allocate page in a sleeping context if GFP flags permit. Hence * spinlock needs to be unlocked and re-locked after allocation. */ - if (!(gfp & __GFP_ATOMIC)) + if (gfpflags_allow_blocking(gfp)) spin_unlock_irqrestore(&as->lock, *flags); page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); - if (!(gfp & __GFP_ATOMIC)) + if (gfpflags_allow_blocking(gfp)) spin_lock_irqsave(&as->lock, *flags); /* diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index d88c46ca82e1..5088637fe5c2 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 412b5a46374c..9db52bc4ce19 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -31,7 +31,6 @@ gfpflag_string(__GFP_HIGHMEM), \ gfpflag_string(GFP_DMA32), \ gfpflag_string(__GFP_HIGH), \ - gfpflag_string(__GFP_ATOMIC), \ gfpflag_string(__GFP_IO), \ gfpflag_string(__GFP_FS), \ gfpflag_string(__GFP_NOWARN), \ diff --git a/lib/test_printf.c b/lib/test_printf.c index d34dc636b81c..46b4e6c414a3 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -674,17 +674,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/mm/internal.h b/mm/internal.h index b0b88a95347f..2d09a7a0600a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 18ca33a1945d..0cfad30fb44c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4105,13 +4105,14 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages)) return true; + /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for __GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -5076,14 +5077,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int zonelist_iter_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - restart: compaction_retries = 0; no_progress_loops = 0; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 8ae0a1535293..f3029742b800 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -653,7 +653,6 @@ static const struct { { "__GFP_HIGHMEM", "HM" }, { "GFP_DMA32", "D32" }, { "__GFP_HIGH", "H" }, - { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, { "__GFP_NOWARN", "NWR" }, -- cgit v1.2.3 From 7ec7096b8577d3b899c1dae456a414f2d08c7ddb Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 17 Jan 2023 20:46:17 +0000 Subject: mm/page_ext: init page_ext early if there are no deferred struct pages page_ext must be initialized after all struct pages are initialized. Therefore, page_ext is initialized after page_alloc_init_late(), and can optionally be initialized earlier via early_page_ext kernel parameter which as a side effect also disables deferred struct pages. Allow to automatically init page_ext early when there are no deferred struct pages in order to be able to use page_ext during kernel boot and track for example page allocations early. [pasha.tatashin@soleen.com: fix build with CONFIG_PAGE_EXTENSION=n] Link: https://lkml.kernel.org/r/20230118155251.2522985-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20230117204617.1553748-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: Mike Rapoport (IBM) Acked-by: Vlastimil Babka Cc: Charan Teja Kalla Cc: David Hildenbrand Cc: Li Zhe Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 2 ++ init/main.c | 6 +++--- mm/page_alloc.c | 6 +++++- mm/page_ext.c | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 67314f648aeb..bc2e39090a1f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -29,6 +29,8 @@ struct page_ext_operations { bool need_shared_flags; }; +extern bool deferred_struct_pages; + #ifdef CONFIG_PAGE_EXTENSION /* diff --git a/init/main.c b/init/main.c index e1c3911d7c70..64cd2ff051c4 100644 --- a/init/main.c +++ b/init/main.c @@ -855,8 +855,8 @@ static void __init mm_init(void) pgtable_init(); debug_objects_mem_init(); vmalloc_init(); - /* Should be run after vmap initialization */ - if (early_page_ext_enabled()) + /* If no deferred init page_ext now, as vmap is fully initialized */ + if (!deferred_struct_pages) page_ext_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); @@ -1628,7 +1628,7 @@ static noinline void __init kernel_init_freeable(void) padata_init(); page_alloc_init_late(); /* Initialize page ext after all struct pages are initialized. */ - if (!early_page_ext_enabled()) + if (deferred_struct_pages) page_ext_init(); do_basic_setup(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0cfad30fb44c..ecb9e9acfe7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -430,6 +430,8 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +bool deferred_struct_pages __meminitdata; + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * During boot we initialize deferred pages on-demand, as needed, but once @@ -6803,8 +6805,10 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; - if (defer_init(nid, pfn, zone_end_pfn)) + if (defer_init(nid, pfn, zone_end_pfn)) { + deferred_struct_pages = true; break; + } } page = pfn_to_page(pfn); diff --git a/mm/page_ext.c b/mm/page_ext.c index e2c22ffdbb81..dc1626be458b 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -92,7 +92,7 @@ unsigned long page_ext_size; static unsigned long total_usage; static struct page_ext *lookup_page_ext(const struct page *page); -bool early_page_ext; +bool early_page_ext __meminitdata; static int __init setup_early_page_ext(char *str) { early_page_ext = true; -- cgit v1.2.3 From 076cf7ea67010d11a97912423f4cdbeff1bd1f5f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 5 Jan 2023 13:55:06 +0530 Subject: mm/page_alloc: use deferred_pages_enabled() wherever applicable Instead of directly accessing static deferred_pages, replace such instances with the helper deferred_pages_enabled(). No functional change is intended. Link: https://lkml.kernel.org/r/20230105082506.241529-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Acked-by: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ecb9e9acfe7f..717f12e83b85 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4288,7 +4288,7 @@ retry: * Watermark failed for this zone, but see if we can * grow this zone if it contains deferred pages. */ - if (static_branch_unlikely(&deferred_pages)) { + if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } @@ -4337,7 +4337,7 @@ try_this_zone: } else { #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ - if (static_branch_unlikely(&deferred_pages)) { + if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } -- cgit v1.2.3 From 420ef683b5217338bc679c33fd9361b52f53a526 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Jan 2023 21:35:26 +0100 Subject: kasan: reset page tags properly with sampling The implementation of page_alloc poisoning sampling assumed that tag_clear_highpage resets page tags for __GFP_ZEROTAGS allocations. However, this is no longer the case since commit 70c248aca9e7 ("mm: kasan: Skip unpoisoning of user pages"). This leads to kernel crashes when MTE-enabled userspace mappings are used with Hardware Tag-Based KASAN enabled. Reset page tags for __GFP_ZEROTAGS allocations in post_alloc_hook(). Also clarify and fix related comments. [andreyknvl@google.com: update comment] Link: https://lkml.kernel.org/r/5dbd866714b4839069e2d8469ac45b60953db290.1674592780.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/24ea20c1b19c2b4b56cf9f5b354915f8dbccfc77.1674592496.git.andreyknvl@google.com Fixes: 44383cef54c0 ("kasan: allow sampling page_alloc allocations for HW_TAGS") Signed-off-by: Andrey Konovalov Reported-by: Peter Collingbourne Tested-by: Peter Collingbourne Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/page_alloc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 717f12e83b85..5ebce58026f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2476,7 +2476,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); - bool reset_tags = !zero_tags; + bool reset_tags = true; int i; set_page_private(page, 0); @@ -2503,7 +2503,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * (which happens only when memory should be initialized as well). */ if (zero_tags) { - /* Initialize both memory and tags. */ + /* Initialize both memory and memory tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); @@ -2521,14 +2521,15 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } else { /* * KASAN decided to exclude this allocation from being - * poisoned due to sampling. Skip poisoning as well. + * (un)poisoned due to sampling. Make KASAN skip + * poisoning when the allocation is freed. */ SetPageSkipKASanPoison(page); } } /* - * If memory tags have not been set, reset the page tags to ensure - * page_address() dereferencing does not fault. + * If memory tags have not been set by KASAN, reset the page tags to + * ensure page_address() dereferencing does not fault. */ if (reset_tags) { for (i = 0; i != 1 << order; ++i) -- cgit v1.2.3 From aa02d3c174abfc506058d3e43f471bc4280563d0 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Fri, 3 Feb 2023 18:01:32 +0800 Subject: mm/page_alloc: reduce fallbacks to (MIGRATE_PCPTYPES - 1) The commit 1dd214b8f21c ("mm: page_alloc: avoid merging non-fallbackable pageblocks with others") has removed MIGRATE_CMA and MIGRATE_ISOLATE from fallbacks list. so there is no need to add an element at the end of every type. Reduce fallbacks to (MIGRATE_PCPTYPES - 1). Link: https://lkml.kernel.org/r/20230203100132.1627787-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Acked-by: Vlastimil Babka Cc: Zi Yan Cc: Mel Gorman Cc: David Hildenbrand Cc: Mike Rapoport Cc: Oscar Salvador Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/page_alloc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ebce58026f1..21d820c42900 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2603,10 +2603,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * * The other migratetypes do not have fallbacks. */ -static int fallbacks[MIGRATE_TYPES][3] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, +static int fallbacks[MIGRATE_TYPES][MIGRATE_PCPTYPES - 1] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE }, }; #ifdef CONFIG_CMA @@ -2865,11 +2865,8 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; *can_steal = false; - for (i = 0;; i++) { + for (i = 0; i < MIGRATE_PCPTYPES - 1 ; i++) { fallback_mt = fallbacks[migratetype][i]; - if (fallback_mt == MIGRATE_TYPES) - break; - if (free_area_empty(area, fallback_mt)) continue; -- cgit v1.2.3 From 1bc67ca65b31bcb669c4eaca79b3c8d205bb212a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Sun, 12 Feb 2023 19:10:27 +0800 Subject: mm: page_alloc: call panic() when memoryless node allocation fails In free_area_init(), we will continue to run after allocation of memoryless node pgdat fails. However, in the subsequent process (such as when initializing zonelist), the case that NODE_DATA(nid) is NULL is not handled, which will cause panic. Instead of this, it's better to call panic() directly when the memory allocation fails during system boot. Link: https://lkml.kernel.org/r/20230212111027.95520-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 21d820c42900..4b6bcec41c8f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8405,11 +8405,9 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Allocator not initialized yet */ pgdat = arch_alloc_nodedata(nid); - if (!pgdat) { - pr_err("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - continue; - } + if (!pgdat) + panic("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); arch_refresh_nodedata(nid, pgdat); free_area_init_memoryless_node(nid); -- cgit v1.2.3 From f7a449f779608efe1941a0e0c4bd7b5f57000be7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 13 Feb 2023 11:29:22 -0800 Subject: mm: memcontrol: rename memcg_kmem_enabled() Currently there are two kmem-related helper functions with a confusing semantics: memcg_kmem_enabled() and mem_cgroup_kmem_disabled(). The problem is that an obvious expectation memcg_kmem_enabled() == !mem_cgroup_kmem_disabled(), can be false. mem_cgroup_kmem_disabled() is similar to mem_cgroup_disabled(): it returns true only if CONFIG_MEMCG_KMEM is not set or the kmem accounting is disabled using a boot time kernel option "cgroup.memory=nokmem". It never changes the value dynamically. memcg_kmem_enabled() is different: it always returns false until the first non-root memory cgroup will get online (assuming the kernel memory accounting is enabled). It's goal is to improve the performance on systems without the cgroupfs mounted/memory controller enabled or on the systems with only the root memory cgroup. To make things more obvious and avoid potential bugs, let's rename memcg_kmem_enabled() to memcg_kmem_online(). Link: https://lkml.kernel.org/r/20230213192922.1146370-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Muchun Song Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Cc: Dennis Zhou Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 +++++++------- mm/memcontrol.c | 8 ++++---- mm/page_alloc.c | 8 ++++---- mm/percpu.c | 2 +- mm/slab.h | 10 +++++----- mm/vmscan.c | 2 +- 6 files changed, 22 insertions(+), 22 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 35478695cabf..5567319027d1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1776,24 +1776,24 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page); int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); -extern struct static_key_false memcg_kmem_enabled_key; +extern struct static_key_false memcg_kmem_online_key; -static inline bool memcg_kmem_enabled(void) +static inline bool memcg_kmem_online(void) { - return static_branch_likely(&memcg_kmem_enabled_key); + return static_branch_likely(&memcg_kmem_online_key); } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) return __memcg_kmem_charge_page(page, gfp, order); return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) __memcg_kmem_uncharge_page(page, order); } @@ -1814,7 +1814,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { struct mem_cgroup *memcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return; rcu_read_lock(); @@ -1854,7 +1854,7 @@ static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) return NULL; } -static inline bool memcg_kmem_enabled(void) +static inline bool memcg_kmem_online(void) { return false; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 17335459d8dc..3e3cdb9bed95 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -345,8 +345,8 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ -DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); -EXPORT_SYMBOL(memcg_kmem_enabled_key); +DEFINE_STATIC_KEY_FALSE(memcg_kmem_online_key); +EXPORT_SYMBOL(memcg_kmem_online_key); #endif /** @@ -3034,7 +3034,7 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return NULL; if (PageMemcgKmem(page)) { @@ -3746,7 +3746,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) objcg->memcg = memcg; rcu_assign_pointer(memcg->objcg, objcg); - static_branch_enable(&memcg_kmem_enabled_key); + static_branch_enable(&memcg_kmem_online_key); memcg->kmemcg_id = memcg->id.id; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4b6bcec41c8f..4c9ab8b93b1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1410,7 +1410,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * Do not let hwpoison pages hit pcplists/buddy * Untie memcg state and reset page's owner */ - if (memcg_kmem_enabled() && PageMemcgKmem(page)) + if (memcg_kmem_online() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); page_table_check_free(page, order); @@ -1441,7 +1441,7 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageMemcgKmem(page)) + if (memcg_kmem_online() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); if (check_free && free_page_is_bad(page)) bad++; @@ -5432,7 +5432,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto out; /* Bulk allocator does not support memcg accounting. */ - if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT)) + if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT)) goto failed; /* Use the single page allocator for one page. */ @@ -5604,7 +5604,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, page = __alloc_pages_slowpath(alloc_gfp, order, &ac); out: - if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && + if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { __free_pages(page, order); page = NULL; diff --git a/mm/percpu.c b/mm/percpu.c index acd78da0493b..28e07ede46f6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1625,7 +1625,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT)) + if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT)) return true; objcg = get_obj_cgroup_from_current(); diff --git a/mm/slab.h b/mm/slab.h index 63fb4c00d529..43966aa5fadf 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -494,7 +494,7 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return true; if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) @@ -535,7 +535,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, unsigned long off; size_t i; - if (!memcg_kmem_enabled() || !objcg) + if (!memcg_kmem_online() || !objcg) return; for (i = 0; i < size; i++) { @@ -567,7 +567,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, struct obj_cgroup **objcgs; int i; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return; objcgs = slab_objcgs(slab); @@ -649,7 +649,7 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) static __always_inline void account_slab(struct slab *slab, int order, struct kmem_cache *s, gfp_t gfp) { - if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT)) + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) memcg_alloc_slab_cgroups(slab, s, gfp, true); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), @@ -659,7 +659,7 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) memcg_free_slab_cgroups(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), diff --git a/mm/vmscan.c b/mm/vmscan.c index 34535bbd4fe9..098c79129c42 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -915,7 +915,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, } /* Call non-slab shrinkers even though kmem is disabled */ - if (!memcg_kmem_enabled() && + if (!memcg_kmem_online() && !(shrinker->flags & SHRINKER_NONSLAB)) continue; -- cgit v1.2.3