diff options
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 327 |
1 files changed, 169 insertions, 158 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index d257916f39e5..1080209a568b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,7 @@ #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -57,7 +57,6 @@ #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> -#include <linux/srcu.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -190,9 +189,7 @@ struct scan_control { int vm_swappiness = 60; LIST_HEAD(shrinker_list); -DEFINE_MUTEX(shrinker_mutex); -DEFINE_SRCU(shrinker_srcu); -static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); +DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -211,21 +208,8 @@ static inline int shrinker_defer_size(int nr_items) static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { - return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu, - lockdep_is_held(&shrinker_mutex)); -} - -static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, - int nid) -{ - return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu); -} - -static void free_shrinker_info_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct shrinker_info, rcu)); + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, @@ -266,7 +250,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); - call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); + kvfree_rcu(old, rcu); } return 0; @@ -292,7 +276,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, size, ret = 0; int map_size, defer_size = 0; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); map_size = shrinker_map_size(shrinker_nr_max); defer_size = shrinker_defer_size(shrinker_nr_max); size = map_size + defer_size; @@ -308,7 +292,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) info->map_nr_max = shrinker_nr_max; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -324,7 +308,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); map_size = shrinker_map_size(new_nr_max); defer_size = shrinker_defer_size(new_nr_max); @@ -352,16 +336,15 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; - int srcu_idx; - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id, info->map); } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); } } @@ -374,7 +357,8 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -388,7 +372,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -398,7 +382,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); idr_remove(&shrinker_idr, id); } @@ -408,7 +392,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); } @@ -417,7 +401,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } @@ -433,7 +417,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - mutex_lock(&shrinker_mutex); + down_read(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -442,15 +426,20 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) atomic_long_add(nr, &parent_info->nr_deferred[i]); } } - mutex_unlock(&shrinker_mutex); + up_read(&shrinker_rwsem); } +/* Returns true for reclaim through cgroup limits or cgroup interfaces. */ static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; } -static bool global_reclaim(struct scan_control *sc) +/* + * Returns true for reclaim on the root cgroup. This is true for direct + * allocator reclaim and reclaim through cgroup interfaces on the root cgroup. + */ +static bool root_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); } @@ -505,7 +494,7 @@ static bool cgroup_reclaim(struct scan_control *sc) return false; } -static bool global_reclaim(struct scan_control *sc) +static bool root_reclaim(struct scan_control *sc) { return true; } @@ -562,7 +551,7 @@ static void flush_reclaim_state(struct scan_control *sc) * memcg reclaim, to make reporting more accurate and reduce * underestimation, but it's probably not worth the complexity for now. */ - if (current->reclaim_state && global_reclaim(sc)) { + if (current->reclaim_state && root_reclaim(sc)) { sc->nr_reclaimed += current->reclaim_state->reclaimed; current->reclaim_state->reclaimed = 0; } @@ -743,9 +732,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker) shrinker->name = NULL; #endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return; } @@ -755,11 +744,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker) { - mutex_lock(&shrinker_mutex); - list_add_tail_rcu(&shrinker->list, &shrinker_list); + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); } static int __register_shrinker(struct shrinker *shrinker) @@ -805,22 +794,20 @@ EXPORT_SYMBOL(register_shrinker); void unregister_shrinker(struct shrinker *shrinker) { struct dentry *debugfs_entry; + int debugfs_id; if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - mutex_lock(&shrinker_mutex); - list_del_rcu(&shrinker->list); + down_write(&shrinker_rwsem); + list_del(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); - debugfs_entry = shrinker_debugfs_remove(shrinker); - mutex_unlock(&shrinker_mutex); - - atomic_inc(&shrinker_srcu_generation); - synchronize_srcu(&shrinker_srcu); + debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); + up_write(&shrinker_rwsem); - debugfs_remove_recursive(debugfs_entry); + shrinker_debugfs_remove(debugfs_entry, debugfs_id); kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; @@ -830,13 +817,15 @@ EXPORT_SYMBOL(unregister_shrinker); /** * synchronize_shrinkers - Wait for all running shrinkers to complete. * - * This is useful to guarantee that all shrinker invocations have seen an - * update, before freeing memory. + * This is equivalent to calling unregister_shrink() and register_shrinker(), + * but atomically and with less overhead. This is useful to guarantee that all + * shrinker invocations have seen an update, before freeing memory, similar to + * rcu. */ void synchronize_shrinkers(void) { - atomic_inc(&shrinker_srcu_generation); - synchronize_srcu(&shrinker_srcu); + down_write(&shrinker_rwsem); + up_write(&shrinker_rwsem); } EXPORT_SYMBOL(synchronize_shrinkers); @@ -945,20 +934,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int srcu_idx, generation; - int i = 0; + int i; if (!mem_cgroup_online(memcg)) return 0; -again: - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + info = shrinker_info_protected(memcg, nid); if (unlikely(!info)) goto unlock; - generation = atomic_read(&shrinker_srcu_generation); - for_each_set_bit_from(i, info->map, info->map_nr_max) { + for_each_set_bit(i, info->map, info->map_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1004,14 +992,14 @@ again: set_shrinker_bit(memcg, nid, i); } freed += ret; - if (atomic_read(&shrinker_srcu_generation) != generation) { - srcu_read_unlock(&shrinker_srcu, srcu_idx); - i++; - goto again; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; } } unlock: - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); return freed; } #else /* CONFIG_MEMCG */ @@ -1048,7 +1036,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { unsigned long ret, freed = 0; struct shrinker *shrinker; - int srcu_idx, generation; /* * The root memcg might be allocated even though memcg is disabled @@ -1060,11 +1047,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - srcu_idx = srcu_read_lock(&shrinker_srcu); + if (!down_read_trylock(&shrinker_rwsem)) + goto out; - generation = atomic_read(&shrinker_srcu_generation); - list_for_each_entry_srcu(shrinker, &shrinker_list, list, - srcu_read_lock_held(&shrinker_srcu)) { + list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1075,14 +1061,19 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; freed += ret; - - if (atomic_read(&shrinker_srcu_generation) != generation) { + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { freed = freed ? : 1; break; } } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); +out: cond_resched(); return freed; } @@ -1620,9 +1611,10 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } -static struct page *alloc_demote_page(struct page *page, unsigned long private) +static struct folio *alloc_demote_folio(struct folio *src, + unsigned long private) { - struct page *target_page; + struct folio *dst; nodemask_t *allowed_mask; struct migration_target_control *mtc; @@ -1640,14 +1632,14 @@ static struct page *alloc_demote_page(struct page *page, unsigned long private) */ mtc->nmask = NULL; mtc->gfp_mask |= __GFP_THISNODE; - target_page = alloc_migration_target(page, (unsigned long)mtc); - if (target_page) - return target_page; + dst = alloc_migration_target(src, (unsigned long)mtc); + if (dst) + return dst; mtc->gfp_mask &= ~__GFP_THISNODE; mtc->nmask = allowed_mask; - return alloc_migration_target(page, (unsigned long)mtc); + return alloc_migration_target(src, (unsigned long)mtc); } /* @@ -1682,7 +1674,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_folios, alloc_demote_page, NULL, + migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); @@ -2269,6 +2261,25 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } +#ifdef CONFIG_CMA +/* + * It is waste of effort to scan and reclaim CMA pages if it is not available + * for current allocation context. Kswapd can not be enrolled as it can not + * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL + */ +static bool skip_cma(struct folio *folio, struct scan_control *sc) +{ + return !current_is_kswapd() && + gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && + get_pageblock_migratetype(&folio->page) == MIGRATE_CMA; +} +#else +static bool skip_cma(struct folio *folio, struct scan_control *sc) +{ + return false; +} +#endif + /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * @@ -2315,7 +2326,8 @@ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, nr_pages = folio_nr_pages(folio); total_scan += nr_pages; - if (folio_zonenum(folio) > sc->reclaim_idx) { + if (folio_zonenum(folio) > sc->reclaim_idx || + skip_cma(folio, sc)) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; goto move; @@ -2457,7 +2469,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ - if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) + if (gfp_has_io_fs(sc->gfp_mask)) inactive >>= 3; too_many = isolated > inactive; @@ -3232,6 +3244,16 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) #endif +static bool should_walk_mmu(void) +{ + return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK); +} + +static bool should_clear_pmd_young(void) +{ + return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG); +} + /****************************************************************************** * shorthand helpers ******************************************************************************/ @@ -3992,28 +4014,29 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); - VM_WARN_ON_ONCE(pmd_leaf(*pmd)); - - ptl = pte_lockptr(args->mm, pmd); - if (!spin_trylock(ptl)) + pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl); + if (!pte) + return false; + if (!spin_trylock(ptl)) { + pte_unmap(pte); return false; + } arch_enter_lazy_mmu_mode(); - - pte = pte_offset_map(pmd, start & PMD_MASK); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; struct folio *folio; + pte_t ptent = ptep_get(pte + i); total++; walk->mm_stats[MM_LEAF_TOTAL]++; - pfn = get_pte_pfn(pte[i], args->vma, addr); + pfn = get_pte_pfn(ptent, args->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) { + if (!pte_young(ptent)) { walk->mm_stats[MM_LEAF_OLD]++; continue; } @@ -4028,7 +4051,7 @@ restart: young++; walk->mm_stats[MM_LEAF_YOUNG]++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); @@ -4041,10 +4064,8 @@ restart: if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; - pte_unmap(pte); - arch_leave_lazy_mmu_mode(); - spin_unlock(ptl); + pte_unmap_unlock(pte, ptl); return suitable_to_scan(total, young); } @@ -4096,7 +4117,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area goto next; if (!pmd_trans_huge(pmd[i])) { - if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + if (should_clear_pmd_young()) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } @@ -4142,7 +4163,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, unsigned long next; unsigned long addr; struct vm_area_struct *vma; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; + DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; @@ -4189,7 +4210,7 @@ restart: #endif walk->mm_stats[MM_NONLEAF_TOTAL]++; - if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (should_clear_pmd_young()) { if (!pmd_young(val)) continue; @@ -4491,7 +4512,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, * handful of PTEs. Spreading the work out over a period of time usually * is less efficient, but it avoids bursty page faults. */ - if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) { + if (!should_walk_mmu()) { success = iterate_mm_list_nowalk(lruvec, max_seq); goto done; } @@ -4673,12 +4694,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; + pte_t ptent = ptep_get(pte + i); - pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + pfn = get_pte_pfn(ptent, pvmw->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) + if (!pte_young(ptent)) continue; folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); @@ -4690,7 +4712,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) young++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); @@ -4742,10 +4764,11 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) { int seg; int old, new; + unsigned long flags; int bin = get_random_u32_below(MEMCG_NR_BINS); struct pglist_data *pgdat = lruvec_pgdat(lruvec); - spin_lock(&pgdat->memcg_lru.lock); + spin_lock_irqsave(&pgdat->memcg_lru.lock, flags); VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); @@ -4780,7 +4803,7 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - spin_unlock(&pgdat->memcg_lru.lock); + spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags); } void lru_gen_online_memcg(struct mem_cgroup *memcg) @@ -4793,7 +4816,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg) struct pglist_data *pgdat = NODE_DATA(nid); struct lruvec *lruvec = get_lruvec(memcg, nid); - spin_lock(&pgdat->memcg_lru.lock); + spin_lock_irq(&pgdat->memcg_lru.lock); VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); @@ -4804,7 +4827,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg) lruvec->lrugen.gen = gen; - spin_unlock(&pgdat->memcg_lru.lock); + spin_unlock_irq(&pgdat->memcg_lru.lock); } } @@ -4828,7 +4851,7 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg) struct pglist_data *pgdat = NODE_DATA(nid); struct lruvec *lruvec = get_lruvec(memcg, nid); - spin_lock(&pgdat->memcg_lru.lock); + spin_lock_irq(&pgdat->memcg_lru.lock); VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); @@ -4840,12 +4863,14 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg) if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - spin_unlock(&pgdat->memcg_lru.lock); + spin_unlock_irq(&pgdat->memcg_lru.lock); } } -void lru_gen_soft_reclaim(struct lruvec *lruvec) +void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + /* see the comment on MEMCG_NR_GENS */ if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); @@ -4911,7 +4936,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) WRITE_ONCE(lrugen->protected[hist][type][tier - 1], lrugen->protected[hist][type][tier - 1] + delta); - __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); return true; } @@ -5306,7 +5330,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool static unsigned long get_nr_to_reclaim(struct scan_control *sc) { /* don't abort memcg reclaim to ensure fairness */ - if (!global_reclaim(sc)) + if (!root_reclaim(sc)) return -1; return max(sc->nr_to_reclaim, compact_gap(sc->order)); @@ -5458,7 +5482,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc { struct blk_plug plug; - VM_WARN_ON_ONCE(global_reclaim(sc)); + VM_WARN_ON_ONCE(root_reclaim(sc)); VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); lru_add_drain(); @@ -5519,7 +5543,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * struct blk_plug plug; unsigned long reclaimed = sc->nr_reclaimed; - VM_WARN_ON_ONCE(!global_reclaim(sc)); + VM_WARN_ON_ONCE(!root_reclaim(sc)); /* * Unmapped clean folios are already prioritized. Scanning for more of @@ -5726,10 +5750,10 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c if (get_cap(LRU_GEN_CORE)) caps |= BIT(LRU_GEN_CORE); - if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) + if (should_walk_mmu()) caps |= BIT(LRU_GEN_MM_WALK); - if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + if (should_clear_pmd_young()) caps |= BIT(LRU_GEN_NONLEAF_YOUNG); return sysfs_emit(buf, "0x%04x\n", caps); @@ -6241,7 +6265,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; - if (lru_gen_enabled() && !global_reclaim(sc)) { + if (lru_gen_enabled() && !root_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); return; } @@ -6397,14 +6421,13 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (!managed_zone(zone)) continue; - switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { - case COMPACT_SUCCESS: - case COMPACT_CONTINUE: + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) + return false; + + if (compaction_suitable(zone, sc->order, sc->reclaim_idx)) return false; - default: - /* check next zone */ - ; - } } /* @@ -6483,7 +6506,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; - if (lru_gen_enabled() && global_reclaim(sc)) { + if (lru_gen_enabled() && root_reclaim(sc)) { lru_gen_shrink_node(pgdat, sc); return; } @@ -6555,10 +6578,13 @@ again: * Legacy memcg will stall in page writeback so avoid forcibly * stalling in reclaim_throttle(). */ - if ((current_is_kswapd() || - (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && - sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { + if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) + set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); + + if (current_is_kswapd()) + set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); + } /* * Stall direct reclaim for IO completions if the lruvec is @@ -6568,7 +6594,8 @@ again: */ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && - test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) + (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || + test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) @@ -6592,14 +6619,14 @@ again: static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; - enum compact_result suitable; - suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); - if (suitable == COMPACT_SUCCESS) - /* Allocation should succeed already. Don't reclaim. */ + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) return true; - if (suitable == COMPACT_SKIPPED) - /* Compaction cannot yet proceed. Do reclaim. */ + + /* Compaction cannot yet proceed. Do reclaim. */ + if (!compaction_suitable(zone, sc->order, sc->reclaim_idx)) return false; /* @@ -6825,7 +6852,7 @@ retry: lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); - clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); } } @@ -6886,7 +6913,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) continue; pfmemalloc_reserve += min_wmark_pages(zone); - free_pages += zone_page_state(zone, NR_FREE_PAGES); + free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES); } /* If there are no reserves (unexpected config) then do not throttle */ @@ -7214,7 +7241,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat) { struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); - clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } @@ -7839,7 +7867,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) /* * This kswapd start function will be called by init and node-hot-add. */ -void kswapd_run(int nid) +void __meminit kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -7860,7 +7888,7 @@ void kswapd_run(int nid) * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done(). */ -void kswapd_stop(int nid) +void __meminit kswapd_stop(int nid) { pg_data_t *pgdat = NODE_DATA(nid); struct task_struct *kswapd; @@ -8057,23 +8085,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) } #endif -void check_move_unevictable_pages(struct pagevec *pvec) -{ - struct folio_batch fbatch; - unsigned i; - - folio_batch_init(&fbatch); - for (i = 0; i < pvec->nr; i++) { - struct page *page = pvec->pages[i]; - - if (PageTransTail(page)) - continue; - folio_batch_add(&fbatch, page_folio(page)); - } - check_move_unevictable_folios(&fbatch); -} -EXPORT_SYMBOL_GPL(check_move_unevictable_pages); - /** * check_move_unevictable_folios - Move evictable folios to appropriate zone * lru list |