From 4f3eaf452a14ff3982f71c1ca8bdf757254231fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 2 Sep 2021 14:52:58 -0700 Subject: mm: report a more useful address for reclaim acquisition A recent lockdep report included these lines: [ 96.177910] 3 locks held by containerd/770: [ 96.177934] #0: ffff88810815ea28 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x115/0x770 [ 96.177999] #1: ffffffff82915020 (rcu_read_lock){....}-{1:2}, at: get_swap_device+0x33/0x140 [ 96.178057] #2: ffffffff82955ba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 While it was not useful to that bug report to know where the reclaim lock had been acquired, it might be useful under other circumstances. Allow the caller of __fs_reclaim_acquire to specify the instruction pointer to use. Link: https://lkml.kernel.org/r/20210719185709.1755149-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Omar Sandoval Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Boqun Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index e24b1fe348e3..8894825cc4db 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -174,13 +174,13 @@ static inline gfp_t current_gfp_context(gfp_t flags) } #ifdef CONFIG_LOCKDEP -extern void __fs_reclaim_acquire(void); -extern void __fs_reclaim_release(void); +extern void __fs_reclaim_acquire(unsigned long ip); +extern void __fs_reclaim_release(unsigned long ip); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else -static inline void __fs_reclaim_acquire(void) { } -static inline void __fs_reclaim_release(void) { } +static inline void __fs_reclaim_acquire(unsigned long ip) { } +static inline void __fs_reclaim_release(unsigned long ip) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif -- cgit v1.2.3 From 633a2abb9e1cd5c95f3b600f4b2c12cce22ae4a0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:04 -0700 Subject: writeback: track number of inodes under writeback Patch series "writeback: Fix bandwidth estimates", v4. Fix estimate of writeback throughput when device is not fully busy doing writeback. Michael Stapelberg has reported that such workload (e.g. generated by linking) tends to push estimated throughput down to 0 and as a result writeback on the device is practically stalled. The first three patches fix the reported issue, the remaining two patches are unrelated cleanups of problems I've noticed when reading the code. This patch (of 4): Track number of inodes under writeback for each bdi_writeback structure. We will use this to decide whether wb does any IO and so we can estimate its writeback throughput. In principle we could use number of pages under writeback (WB_WRITEBACK counter) for this however normal percpu counter reads are too inaccurate for our purposes and summing the counter is too expensive. Link: https://lkml.kernel.org/r/20210713104519.16394-1-jack@suse.cz Link: https://lkml.kernel.org/r/20210713104716.22868-1-jack@suse.cz Signed-off-by: Jan Kara Cc: Wu Fengguang Cc: Michael Stapelberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 5 +++++ include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 1 + mm/page-writeback.c | 22 ++++++++++++++++++++-- 4 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4c3370548982..7439ecd44ac9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -406,6 +406,11 @@ static bool inode_do_switch_wbs(struct inode *inode, inc_wb_stat(new_wb, WB_WRITEBACK); } + if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + atomic_dec(&old_wb->writeback_inodes); + atomic_inc(&new_wb->writeback_inodes); + } + wb_get(new_wb); /* diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 1d7edad9914f..06fb8e13f6bc 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -116,6 +116,7 @@ struct bdi_writeback { struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ + atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; unsigned long congested; /* WB_[a]sync_congested flags */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f5561ea7d90a..b4c707ddedb1 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -293,6 +293,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); + atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f63548f247c..e1aa1c9d8e36 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2731,6 +2731,16 @@ int clear_page_dirty_for_io(struct page *page) } EXPORT_SYMBOL(clear_page_dirty_for_io); +static void wb_inode_writeback_start(struct bdi_writeback *wb) +{ + atomic_inc(&wb->writeback_inodes); +} + +static void wb_inode_writeback_end(struct bdi_writeback *wb) +{ + atomic_dec(&wb->writeback_inodes); +} + int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -2752,6 +2762,9 @@ int test_clear_page_writeback(struct page *page) dec_wb_stat(wb, WB_WRITEBACK); __wb_writeout_inc(wb); + if (!mapping_tagged(mapping, + PAGECACHE_TAG_WRITEBACK)) + wb_inode_writeback_end(wb); } } @@ -2794,8 +2807,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) - inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { + struct bdi_writeback *wb = inode_to_wb(inode); + + inc_wb_stat(wb, WB_WRITEBACK); + if (!on_wblist) + wb_inode_writeback_start(wb); + } /* * We can come through here when swapping anonymous -- cgit v1.2.3 From fee468fdf41cdf36ba6b5a780e2474d0a3e066ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:06 -0700 Subject: writeback: reliably update bandwidth estimation Currently we trigger writeback bandwidth estimation from balance_dirty_pages() and from wb_writeback(). However neither of these need to trigger when the system is relatively idle and writeback is triggered e.g. from fsync(2). Make sure writeback estimates happen reliably by triggering them from do_writepages(). Link: https://lkml.kernel.org/r/20210713104716.22868-2-jack@suse.cz Signed-off-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 3 --- include/linux/backing-dev.h | 19 +++++++++++++++++++ include/linux/writeback.h | 1 - mm/page-writeback.c | 39 +++++++++++++++++++++++++++------------ 4 files changed, 46 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7439ecd44ac9..867984e778c3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2004,7 +2004,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; unsigned long dirtied_before = jiffies; struct inode *inode; @@ -2058,8 +2057,6 @@ static long wb_writeback(struct bdi_writeback *wb, progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb, work); - wb_update_bandwidth(wb, wb_start); - /* * Did we write something? Try for more * diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 44df4fcef65c..8a886bca51e5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -288,6 +288,17 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) return inode->i_wb; } +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) +{ + /* + * If wbc does not have inode attached, it means cgroup writeback was + * disabled when wbc started. Just use the default wb in that case. + */ + return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb; +} + /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode @@ -366,6 +377,14 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return &inode_to_bdi(inode)->wb; } +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) +{ + return inode_to_wb(inode); +} + + static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 667e86cfbdcf..2480322c06a7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,7 +379,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e1aa1c9d8e36..e4a381b8944b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1332,7 +1332,6 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, - unsigned long start_time, bool update_ratelimit) { struct bdi_writeback *wb = gdtc->wb; @@ -1352,13 +1351,6 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); - /* - * Skip quiet periods when disk bandwidth is under-utilized. - * (at least 1s idle time between two flusher runs) - */ - if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) - goto snapshot; - if (update_ratelimit) { domain_update_bandwidth(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); @@ -1374,17 +1366,36 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, } wb_update_write_bandwidth(wb, elapsed, written); -snapshot: wb->dirtied_stamp = dirtied; wb->written_stamp = written; wb->bw_time_stamp = now; } -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) +static void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; - __wb_update_bandwidth(&gdtc, NULL, start_time, false); + spin_lock(&wb->list_lock); + __wb_update_bandwidth(&gdtc, NULL, false); + spin_unlock(&wb->list_lock); +} + +/* Interval after which we consider wb idle and don't estimate bandwidth */ +#define WB_BANDWIDTH_IDLE_JIF (HZ) + +static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); + + if (elapsed > WB_BANDWIDTH_IDLE_JIF && + !atomic_read(&wb->writeback_inodes)) { + spin_lock(&wb->list_lock); + wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); + wb->written_stamp = wb_stat(wb, WB_WRITTEN); + wb->bw_time_stamp = now; + spin_unlock(&wb->list_lock); + } } /* @@ -1713,7 +1724,7 @@ free_running: if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) { spin_lock(&wb->list_lock); - __wb_update_bandwidth(gdtc, mdtc, start_time, true); + __wb_update_bandwidth(gdtc, mdtc, true); spin_unlock(&wb->list_lock); } @@ -2347,9 +2358,12 @@ EXPORT_SYMBOL(generic_writepages); int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; + struct bdi_writeback *wb; if (wbc->nr_to_write <= 0) return 0; + wb = inode_to_wb_wbc(mapping->host, wbc); + wb_bandwidth_estimate_start(wb); while (1) { if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); @@ -2360,6 +2374,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); } + wb_update_bandwidth(wb); return ret; } -- cgit v1.2.3 From 45a2966fd64147518dc5bca25f447bd0fb5359ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:09 -0700 Subject: writeback: fix bandwidth estimate for spiky workload Michael Stapelberg has reported that for workload with short big spikes of writes (GCC linker seem to trigger this frequently) the write throughput is heavily underestimated and tends to steadily sink until it reaches zero. This has rather bad impact on writeback throttling (causing stalls). The problem is that writeback throughput estimate gets updated at most once per 200 ms. One update happens early after we submit pages for writeback (at that point writeout of only small fraction of pages is completed and thus observed throughput is tiny). Next update happens only during the next write spike (updates happen only from inode writeback and dirty throttling code) and if that is more than 1s after previous spike, we decide system was idle and just ignore whatever was written until this moment. Fix the problem by making sure writeback throughput estimate is also updated shortly after writeback completes to get reasonable estimate of throughput for spiky workloads. [jack@suse.cz: avoid division by 0 in wb_update_dirty_ratelimit()] Link: https://lore.kernel.org/lkml/20210617095309.3542373-1-stapelberg+linux@google.com Link: https://lkml.kernel.org/r/20210713104716.22868-3-jack@suse.cz Signed-off-by: Jan Kara Reported-by: Michael Stapelberg Tested-by: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 1 + include/linux/writeback.h | 1 + mm/backing-dev.c | 10 ++++++++++ mm/page-writeback.c | 39 +++++++++++++++++++++++++-------------- 4 files changed, 37 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 06fb8e13f6bc..33207004cfde 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -143,6 +143,7 @@ struct bdi_writeback { spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ unsigned long dirty_sleep; /* last wait */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2480322c06a7..cbaef099645e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,6 +379,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); +void wb_update_bandwidth(struct bdi_writeback *wb); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b4c707ddedb1..6122c78ce914 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) spin_unlock_bh(&wb->work_lock); } +static void wb_update_bandwidth_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, bw_dwork); + + wb_update_bandwidth(wb); +} + /* * Initial write bandwidth: 100 MB/s */ @@ -303,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); wb->dirty_sleep = jiffies; err = fprop_local_init_percpu(&wb->completions, gfp); @@ -351,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); + flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e4a381b8944b..156f5888c09d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1336,18 +1336,19 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; - unsigned long elapsed = now - wb->bw_time_stamp; + unsigned long elapsed; unsigned long dirtied; unsigned long written; - lockdep_assert_held(&wb->list_lock); + spin_lock(&wb->list_lock); /* - * rate-limit, only update once every 200ms. + * Lockless checks for elapsed time are racy and delayed update after + * IO completion doesn't do it at all (to make sure written pages are + * accounted reasonably quickly). Make sure elapsed >= 1 to avoid + * division errors. */ - if (elapsed < BANDWIDTH_INTERVAL) - return; - + elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); @@ -1369,15 +1370,14 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, wb->dirtied_stamp = dirtied; wb->written_stamp = written; wb->bw_time_stamp = now; + spin_unlock(&wb->list_lock); } -static void wb_update_bandwidth(struct bdi_writeback *wb) +void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; - spin_lock(&wb->list_lock); __wb_update_bandwidth(&gdtc, NULL, false); - spin_unlock(&wb->list_lock); } /* Interval after which we consider wb idle and don't estimate bandwidth */ @@ -1722,11 +1722,8 @@ free_running: wb->dirty_exceeded = 1; if (time_is_before_jiffies(wb->bw_time_stamp + - BANDWIDTH_INTERVAL)) { - spin_lock(&wb->list_lock); + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); - spin_unlock(&wb->list_lock); - } /* throttle according to the chosen dtc */ dirty_ratelimit = wb->dirty_ratelimit; @@ -2374,7 +2371,13 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); } - wb_update_bandwidth(wb); + /* + * Usually few pages are written by now from those we've just submitted + * but if there's constant writeback being submitted, this makes sure + * writeback bandwidth is updated once in a while. + */ + if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) + wb_update_bandwidth(wb); return ret; } @@ -2754,6 +2757,14 @@ static void wb_inode_writeback_start(struct bdi_writeback *wb) static void wb_inode_writeback_end(struct bdi_writeback *wb) { atomic_dec(&wb->writeback_inodes); + /* + * Make sure estimate of writeback throughput gets updated after + * writeback completed. We delay the update by BANDWIDTH_INTERVAL + * (which is the interval other bandwidth updates use for batching) so + * that if multiple inodes end writeback at a similar time, they get + * batched into one bandwidth update. + */ + queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); } int test_clear_page_writeback(struct page *page) -- cgit v1.2.3 From 7490a2d248145d8694e1e9828801b496250fd697 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:53:27 -0700 Subject: writeback: memcg: simplify cgroup_writeback_by_id Currently cgroup_writeback_by_id calls mem_cgroup_wb_stats() to get dirty pages for a memcg. However mem_cgroup_wb_stats() does a lot more than just get the number of dirty pages. Just directly get the number of dirty pages instead of calling mem_cgroup_wb_stats(). Also cgroup_writeback_by_id() is only called for best-effort dirty flushing, so remove the unused 'nr' parameter and no need to explicitly flush memcg stats. Link: https://lkml.kernel.org/r/20210722182627.2267368-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 20 +++++++++----------- include/linux/memcontrol.h | 15 +++++++++++++++ include/linux/writeback.h | 2 +- mm/memcontrol.c | 13 +------------ 4 files changed, 26 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 867984e778c3..35894a2dba75 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1039,20 +1039,20 @@ restart: * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id - * @nr: number of pages to write, 0 for best-effort dirty flushing * @reason: reason why some writeback work initiated * @done: target wb_completion * * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id * with the specified parameters. */ -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done) { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; struct wb_writeback_work *work; + unsigned long dirty; int ret; /* lookup bdi and memcg */ @@ -1081,24 +1081,22 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, } /* - * If @nr is zero, the caller is attempting to write out most of + * The caller is attempting to write out most of * the currently dirty pages. Let's take the current dirty page * count and inflate it by 25% which should be large enough to * flush out most dirty pages while avoiding getting livelocked by * concurrent dirtiers. + * + * BTW the memcg stats are flushed periodically and this is best-effort + * estimation, so some potential error is ok. */ - if (!nr) { - unsigned long filepages, headroom, dirty, writeback; - - mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty, - &writeback); - nr = dirty * 10 / 8; - } + dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY); + dirty = dirty * 10 / 8; /* issue the writeback work */ work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); if (work) { - work->nr_pages = nr; + work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; work->range_cyclic = 1; work->reason = reason; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 24797929d8a1..3403ec77528a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -955,6 +955,16 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { @@ -1391,6 +1401,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + return 0; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index cbaef099645e..aeda2c0c9986 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -218,7 +218,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(void); bool cleanup_offline_cgwb(struct bdi_writeback *wb); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 702a81dfe72d..1047f0271ff8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -645,17 +645,6 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); } -/* idx can be of type enum memcg_stat_item or node_stat_item. */ -static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - /* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { @@ -4668,7 +4657,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb) atomic_read(&frn->done.cnt) == 1) { frn->at = 0; trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); - cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, WB_REASON_FOREIGN_FLUSH, &frn->done); } -- cgit v1.2.3 From 6de522d1667f628376517e4b177af10c739d745b Mon Sep 17 00:00:00 2001 From: Jing Yangyang Date: Thu, 2 Sep 2021 14:53:30 -0700 Subject: include/linux/buffer_head.h: fix boolreturn.cocci warnings ./include/linux/buffer_head.h:412:64-65:WARNING:return of 0/1 in function 'has_bh_in_lru' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Link: https://lkml.kernel.org/r/20210824055828.58783-1-deng.changcheng@zte.com.cn Signed-off-by: Jing Yangyang Reported-by: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e7e99da31349..6486d3c19463 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -409,7 +409,7 @@ static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bh_lrus_cpu(int cpu) {} -static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; } +static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BLOCK */ -- cgit v1.2.3 From 54d516b1d62ff8f17cee2da06e5e4706a0d00b8a Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:53:51 -0700 Subject: mm/gup: small refactoring: simplify try_grab_page() try_grab_page() does the same thing as try_grab_compound_head(..., refs=1, ...), just with a different API. So there is a lot of code duplication there. Change try_grab_page() to call try_grab_compound_head(), while keeping the API contract identical for callers. Also, now that try_grab_compound_head() always has a caller, remove the __maybe_unused annotation. Link: https://lkml.kernel.org/r/20210813044133.1536842-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- mm/gup.c | 35 +++++------------------------------ 2 files changed, 7 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..af4845e81b84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1214,8 +1214,8 @@ static inline void get_page(struct page *page) } bool __must_check try_grab_page(struct page *page, unsigned int flags); -__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs, - unsigned int flags); +struct page *try_grab_compound_head(struct page *page, int refs, + unsigned int flags); static inline __must_check bool try_get_page(struct page *page) diff --git a/mm/gup.c b/mm/gup.c index 26ce6bb52044..d60419ed9262 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -124,8 +124,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * considered failure, and furthermore, a likely bug in the caller, so a warning * is also emitted. */ -__maybe_unused struct page *try_grab_compound_head(struct page *page, - int refs, unsigned int flags) +struct page *try_grab_compound_head(struct page *page, + int refs, unsigned int flags) { if (flags & FOLL_GET) return try_get_compound_head(page, refs); @@ -208,35 +208,10 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) */ bool __must_check try_grab_page(struct page *page, unsigned int flags) { - WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); + if (!(flags & (FOLL_GET | FOLL_PIN))) + return true; - if (flags & FOLL_GET) - return try_get_page(page); - else if (flags & FOLL_PIN) { - int refs = 1; - - page = compound_head(page); - - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) - return false; - - if (hpage_pincount_available(page)) - hpage_pincount_add(page, 1); - else - refs = GUP_PIN_COUNTING_BIAS; - - /* - * Similar to try_grab_compound_head(): even if using the - * hpage_pincount_add/_sub() routines, be sure to - * *also* increment the normal page refcount field at least - * once, so that the page really is pinned. - */ - page_ref_add(page, refs); - - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1); - } - - return true; + return try_grab_compound_head(page, 1, flags); } /** -- cgit v1.2.3 From 9857a17f206ff374aea78bccfb687f145368be2e Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:53:54 -0700 Subject: mm/gup: remove try_get_page(), call try_get_compound_head() directly try_get_page() is very similar to try_get_compound_head(), and in fact try_get_page() has fallen a little behind in terms of maintenance: try_get_compound_head() handles speculative page references more thoroughly. There are only two try_get_page() callsites, so just call try_get_compound_head() directly from those, and remove try_get_page() entirely. Also, seeing as how this changes try_get_compound_head() into a non-static function, provide some kerneldoc documentation for it. Link: https://lkml.kernel.org/r/20210813044133.1536842-4-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/fault.c | 2 +- fs/pipe.c | 2 +- include/linux/mm.h | 10 +--------- mm/gup.c | 21 +++++++++++++++++---- 4 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e33c43b38afe..81d760749987 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -817,7 +817,7 @@ void do_secure_storage_access(struct pt_regs *regs) break; case KERNEL_FAULT: page = phys_to_page(addr); - if (unlikely(!try_get_page(page))) + if (unlikely(!try_get_compound_head(page, 1))) break; rc = arch_make_page_accessible(page); put_page(page); diff --git a/fs/pipe.c b/fs/pipe.c index 6d4342bad9f1..1fa1f52763f0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -191,7 +191,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal); */ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - return try_get_page(buf->page); + return try_get_compound_head(buf->page, 1); } EXPORT_SYMBOL(generic_pipe_buf_get); diff --git a/include/linux/mm.h b/include/linux/mm.h index af4845e81b84..d3439dd4f4ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1217,15 +1217,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags); struct page *try_grab_compound_head(struct page *page, int refs, unsigned int flags); - -static inline __must_check bool try_get_page(struct page *page) -{ - page = compound_head(page); - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) - return false; - page_ref_inc(page); - return true; -} +struct page *try_get_compound_head(struct page *page, int refs); static inline void put_page(struct page *page) { diff --git a/mm/gup.c b/mm/gup.c index d60419ed9262..1c7f4ec6990b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -62,11 +62,24 @@ static void put_page_refs(struct page *page, int refs) put_page(page); } -/* - * Return the compound head page with ref appropriately incremented, - * or NULL if that failed. +/** + * try_get_compound_head() - return the compound head page with refcount + * appropriately incremented, or NULL if that failed. + * + * This handles potential refcount overflow correctly. It also works correctly + * for various lockless get_user_pages()-related callers, due to the use of + * page_cache_add_speculative(). + * + * Even though the name includes "compound_head", this function is still + * appropriate for callers that have a non-compound @page to get. + * + * @page: pointer to page to be gotten + * @refs: the value to add to the page's refcount + * + * Return: head page (with refcount appropriately incremented) for success, or + * NULL upon failure. */ -static inline struct page *try_get_compound_head(struct page *page, int refs) +struct page *try_get_compound_head(struct page *page, int refs) { struct page *head = compound_head(page); -- cgit v1.2.3 From 3969b1a654fb09b7915efc1aa4ad45daf932e12f Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:54:00 -0700 Subject: mm: delete unused get_kernel_page() get_kernel_page() was added in 2012 by [1]. It was used for a while for NFS, but then in 2014, a refactoring [2] removed all callers, and it has apparently not been used since. Remove get_kernel_page() because it has no callers. [1] commit 18022c5d8627 ("mm: add get_kernel_page[s] for pinning of kernel addresses for I/O") [2] commit 91f79c43d1b5 ("new helper: iov_iter_get_pages_alloc()") Link: https://lkml.kernel.org/r/20210729221847.1165665-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Mel Gorman Cc: Rik van Riel Cc: David S. Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/swap.c | 22 ---------------------- 2 files changed, 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d3439dd4f4ba..35bbac32b6f6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1839,7 +1839,6 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct kvec; int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, struct page **pages); -int get_kernel_page(unsigned long start, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); diff --git a/mm/swap.c b/mm/swap.c index 19600430e536..897200d27dd0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -179,28 +179,6 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, } EXPORT_SYMBOL_GPL(get_kernel_pages); -/* - * get_kernel_page() - pin a kernel page in memory - * @start: starting kernel address - * @write: pinning for read/write, currently ignored - * @pages: array that receives pointer to the page pinned. - * Must be at least nr_segs long. - * - * Returns 1 if page is pinned. If the page was not pinned, returns - * -errno. The page returned must be released with a put_page() call - * when it is finished with. - */ -int get_kernel_page(unsigned long start, int write, struct page **pages) -{ - const struct kvec kiov = { - .iov_base = (void *)start, - .iov_len = PAGE_SIZE - }; - - return get_kernel_pages(&kiov, 1, write, pages); -} -EXPORT_SYMBOL_GPL(get_kernel_page); - static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec)) { -- cgit v1.2.3 From bf11b9a8e9a93c1fc0ebfc2929622d5cf7d43888 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 2 Sep 2021 14:54:03 -0700 Subject: shmem: use raw_spinlock_t for ->stat_lock Each CPU has SHMEM_INO_BATCH inodes available in `->ino_batch' which is per-CPU. Access here is serialized by disabling preemption. If the pool is empty, it gets reloaded from `->next_ino'. Access here is serialized by ->stat_lock which is a spinlock_t and can not be acquired with disabled preemption. One way around it would make per-CPU ino_batch struct containing the inode number a local_lock_t. Another solution is to promote ->stat_lock to a raw_spinlock_t. The critical sections are short. The mpol_put() must be moved outside of the critical section to avoid invoking the destructor with disabled preemption. Link: https://lkml.kernel.org/r/20210806142916.jdwkb5bx62q5fwfo@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 +- mm/shmem.c | 31 +++++++++++++++++-------------- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 8e775ce517bb..0a8499fb9c3c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -31,7 +31,7 @@ struct shmem_sb_info { struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ - spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ + raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ diff --git a/mm/shmem.c b/mm/shmem.c index dacda7463d54..7eec13e39ac3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -278,10 +278,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_inodes--; @@ -304,7 +304,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) } *inop = ino; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it @@ -319,13 +319,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) * to worry about things like glibc compatibility. */ ino_t *next_ino; + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } @@ -341,9 +342,9 @@ static void shmem_free_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } } @@ -1453,10 +1454,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { - spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } @@ -3488,9 +3489,10 @@ static int shmem_reconfigure(struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long inodes; + struct mempolicy *mpol = NULL; const char *err; - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { @@ -3535,14 +3537,15 @@ static int shmem_reconfigure(struct fs_context *fc) * Preserve previous mempolicy unless mpol remount option was specified. */ if (ctx->mpol) { - mpol_put(sbinfo->mpol); + mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); + mpol_put(mpol); return 0; out: - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } @@ -3659,7 +3662,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; - spin_lock_init(&sbinfo->stat_lock); + raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); -- cgit v1.2.3 From d144bf6205342a4b5fed5d204ae18849a4de741b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:21 -0700 Subject: huge tmpfs: fix split_huge_page() after FALLOC_FL_KEEP_SIZE A successful shmem_fallocate() guarantees that the extent has been reserved, even beyond i_size when the FALLOC_FL_KEEP_SIZE flag was used. But that guarantee is broken by shmem_unused_huge_shrink()'s attempts to split huge pages and free their excess beyond i_size; and by other uses of split_huge_page() near i_size. It's sad to add a shmem inode field just for this, but I did not find a better way to keep the guarantee. A flag to say KEEP_SIZE has been used would be cheaper, but I'm averse to unclearable flags. The fallocend field is not perfect either (many disjoint ranges might be fallocated), but good enough; and gains another use later on. Link: https://lkml.kernel.org/r/ca9a146-3a59-6cd3-7f28-e9a044bb1052@google.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 13 +++++++++++++ mm/huge_memory.c | 6 ++++-- mm/shmem.c | 15 ++++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0a8499fb9c3c..bfc5899d18e0 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -18,6 +18,7 @@ struct shmem_inode_info { unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ + pgoff_t fallocend; /* highest fallocate endindex */ struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ struct shared_policy policy; /* NUMA memory alloc policy */ @@ -119,6 +120,18 @@ static inline bool shmem_file(struct file *file) return shmem_mapping(file->f_mapping); } +/* + * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages + * beyond i_size's notion of EOF, which fallocate has committed to reserving: + * which split_huge_page() must therefore not delete. This use of a single + * "fallocend" per inode errs on the side of not deleting a reservation when + * in doubt: there are plenty of cases when it preserves unreserved pages. + */ +static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) +{ + return max(eof, SHMEM_I(inode)->fallocend); +} + extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index afff3ac87067..890fb73ac89b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2454,11 +2454,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); - /* Some pages can be beyond i_size: drop them from page cache */ + /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { ClearPageDirty(head + i); __delete_from_page_cache(head + i, NULL); - if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) + if (shmem_mapping(head->mapping)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); } else if (!PageAnon(page)) { @@ -2686,6 +2686,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * head page lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (shmem_mapping(mapping)) + end = shmem_fallocend(mapping->host, end); } /* diff --git a/mm/shmem.c b/mm/shmem.c index 9ef579f6cab3..e391325dfc21 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -902,6 +902,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (lend == -1) end = -1; /* unsigned, so actually very big */ + if (info->fallocend > start && info->fallocend <= end && !unfalloc) + info->fallocend = start; + pagevec_init(&pvec); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, @@ -2650,7 +2653,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; - pgoff_t start, index, end; + pgoff_t start, index, end, undo_fallocend; int error; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2719,6 +2722,15 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); + /* + * info->fallocend is only relevant when huge pages might be + * involved: to prevent split_huge_page() freeing fallocated + * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. + */ + undo_fallocend = info->fallocend; + if (info->fallocend < end) + info->fallocend = end; + for (index = start; index < end; ) { struct page *page; @@ -2733,6 +2745,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { + info->fallocend = undo_fallocend; /* Remove the !PageUptodate pages we added */ if (index > start) { shmem_undo_range(inode, -- cgit v1.2.3 From acdd9f8e0fed9f1bd7e83a8ff934694bb4c9a72b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:34 -0700 Subject: huge tmpfs: SGP_NOALLOC to stop collapse_file() on race khugepaged's collapse_file() currently uses SGP_NOHUGE to tell shmem_getpage() not to try allocating a huge page, in the very unlikely event that a racing hole-punch removes the swapped or fallocated page as soon as i_pages lock is dropped. We want to consolidate shmem's huge decisions, removing SGP_HUGE and SGP_NOHUGE; but cannot quite persuade ourselves that it's okay to regress the protection in this case - Yang Shi points out that the huge page would remain indefinitely, charged to root instead of the intended memcg. collapse_file() should not even allocate a small page in this case: why proceed if someone is punching a hole? SGP_READ is almost the right flag here, except that it optimizes away from a fallocated page, with NULL to tell caller to fill with zeroes (like a hole); whereas collapse_file()'s sequence relies on using a cache page. Add SGP_NOALLOC just for this. There are too many consecutive "if (page"s there in shmem_getpage_gfp(): group it better; and fix the outdated "bring it back from swap" comment. Link: https://lkml.kernel.org/r/1355343b-acf-4653-ef79-6aee40214ac5@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 1 + mm/khugepaged.c | 2 +- mm/shmem.c | 29 +++++++++++++++++------------ 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index bfc5899d18e0..a3f4502ec8a9 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,6 +94,7 @@ extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, /* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b0412be08fa2..045cc579f724 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1721,7 +1721,7 @@ static void collapse_file(struct mm_struct *mm, xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, - SGP_NOHUGE)) { + SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } diff --git a/mm/shmem.c b/mm/shmem.c index 2df6a5370cd7..867cb404090a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1854,26 +1854,31 @@ repeat: return error; } - if (page) + if (page) { hindex = page->index; - if (page && sgp == SGP_WRITE) - mark_page_accessed(page); - - /* fallocated page? */ - if (page && !PageUptodate(page)) { + if (sgp == SGP_WRITE) + mark_page_accessed(page); + if (PageUptodate(page)) + goto out; + /* fallocated page */ if (sgp != SGP_READ) goto clear; unlock_page(page); put_page(page); - page = NULL; - hindex = index; } - if (page || sgp == SGP_READ) - goto out; /* - * Fast cache lookup did not find it: - * bring it back from swap or allocate. + * SGP_READ: succeed on hole, with NULL page, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail. + */ + *pagep = NULL; + if (sgp == SGP_READ) + return 0; + if (sgp == SGP_NOALLOC) + return -ENOENT; + + /* + * Fast cache lookup and swap lookup did not find it: allocate. */ if (vma && userfaultfd_missing(vma)) { -- cgit v1.2.3 From 5e6e5a12a44ca5ff2b130d8d39aaf9b8c026de94 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:37 -0700 Subject: huge tmpfs: shmem_is_huge(vma, inode, index) Extend shmem_huge_enabled(vma) to shmem_is_huge(vma, inode, index), so that a consistent set of checks can be applied, even when the inode is accessed through read/write syscalls (with NULL vma) instead of mmaps (the index argument is seldom of interest, but required by mount option "huge=within_size"). Clean up and rearrange the checks a little. This then replaces the checks which shmem_fault() and shmem_getpage_gfp() were making, and eliminates the SGP_HUGE and SGP_NOHUGE modes. Replace a couple of 0s by explicit SHMEM_HUGE_NEVERs; and replace the obscure !shmem_mapping() symlink check by explicit S_ISLNK() - nothing else needs that symlink check, so leave it there in shmem_getpage_gfp(). Link: https://lkml.kernel.org/r/23a77889-2ddc-b030-75cd-44ca27fd4d1@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 9 ++++-- mm/shmem.c | 84 ++++++++++++++---------------------------------- 2 files changed, 31 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a3f4502ec8a9..166158b6e917 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -86,7 +86,12 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse); -extern bool shmem_huge_enabled(struct vm_area_struct *vma); +extern bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); +} extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -96,8 +101,6 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ - SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; diff --git a/mm/shmem.c b/mm/shmem.c index 867cb404090a..69c9788a0094 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -471,39 +471,35 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ -static int shmem_huge __read_mostly; +static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_huge_enabled(struct vm_area_struct *vma) +bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) { - struct inode *inode = file_inode(vma->vm_file); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); loff_t i_size; - pgoff_t off; - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (shmem_huge == SHMEM_HUGE_FORCE) - return true; if (shmem_huge == SHMEM_HUGE_DENY) return false; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: + if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return false; + if (shmem_huge == SHMEM_HUGE_FORCE) + return true; + + switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: return true; case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); + index = round_up(index, HPAGE_PMD_NR); i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) + if (i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= index) return true; fallthrough; case SHMEM_HUGE_ADVISE: - /* TODO: implement fadvise() hints */ - return (vma->vm_flags & VM_HUGEPAGE); + if (vma && (vma->vm_flags & VM_HUGEPAGE)) + return true; + fallthrough; default: - VM_BUG_ON(1); return false; } } @@ -677,6 +673,12 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY +bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ + return false; +} + static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { @@ -1812,7 +1814,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct page *page; - enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; gfp_t huge_gfp; int error; @@ -1821,8 +1822,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; - if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) - sgp = SGP_CACHE; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { @@ -1886,36 +1885,12 @@ repeat: return 0; } - /* shmem_symlink() */ - if (!shmem_mapping(mapping)) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + /* Never use a huge page for shmem_symlink() */ + if (S_ISLNK(inode->i_mode)) goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_FORCE) - goto alloc_huge; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: + if (!shmem_is_huge(vma, inode, index)) goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: { - loff_t i_size; - pgoff_t off; - - off = round_up(index, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - goto alloc_huge; - fallthrough; - } - case SHMEM_HUGE_ADVISE: - if (sgp_huge == SGP_HUGE) - goto alloc_huge; - /* TODO: implement fadvise() hints */ - goto alloc_nohuge; - } - -alloc_huge: huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); @@ -2071,7 +2046,6 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); - enum sgp_type sgp; int err; vm_fault_t ret = VM_FAULT_LOCKED; @@ -2134,15 +2108,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - sgp = SGP_CACHE; - - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - sgp = SGP_NOHUGE; - else if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - - err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, gfp, vma, vmf, &ret); if (err) return vmf_error(err); @@ -3950,7 +3916,7 @@ int __init shmem_init(void) if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else - shmem_huge = 0; /* just in case it was patched */ + shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ #endif return 0; -- cgit v1.2.3 From 2c8d8f97ae2272f1455ee31a5af62b326772eb31 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:50 -0700 Subject: mm, memcg: inline mem_cgroup_{charge/uncharge} to improve disabled memcg config Inline mem_cgroup_{charge/uncharge} and mem_cgroup_uncharge_list functions functions to perform mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~0.4% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jens Axboe Cc: Joonsoo Kim Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Minchan Kim Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 28 +++++++++++++++++++++++++--- mm/memcontrol.c | 33 ++++++++++++--------------------- 2 files changed, 37 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3403ec77528a..a00bf337567b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -693,13 +693,35 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) page_counter_read(&memcg->memory); } -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); +int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask); +static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) +{ + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_charge(page, mm, gfp_mask); +} + int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); -void mem_cgroup_uncharge(struct page *page); -void mem_cgroup_uncharge_list(struct list_head *page_list); +void __mem_cgroup_uncharge(struct page *page); +static inline void mem_cgroup_uncharge(struct page *page) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge(page); +} + +void __mem_cgroup_uncharge_list(struct list_head *page_list); +static inline void mem_cgroup_uncharge_list(struct list_head *page_list) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_list(page_list); +} void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 211f3911228f..33bb8434eea0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6693,8 +6693,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } -static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, - gfp_t gfp) +static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) { unsigned int nr_pages = thp_nr_pages(page); int ret; @@ -6715,7 +6714,7 @@ out: } /** - * mem_cgroup_charge - charge a newly allocated page to a cgroup + * __mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode @@ -6728,16 +6727,14 @@ out: * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) +int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) { struct mem_cgroup *memcg; int ret; - if (mem_cgroup_disabled()) - return 0; - memcg = get_mem_cgroup_from_mm(mm); - ret = __mem_cgroup_charge(page, memcg, gfp_mask); + ret = charge_memcg(page, memcg, gfp_mask); css_put(&memcg->css); return ret; @@ -6772,7 +6769,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); - ret = __mem_cgroup_charge(page, memcg, gfp); + ret = charge_memcg(page, memcg, gfp); css_put(&memcg->css); return ret; @@ -6908,18 +6905,15 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } /** - * mem_cgroup_uncharge - uncharge a page + * __mem_cgroup_uncharge - uncharge a page * @page: page to uncharge * - * Uncharge a page previously charged with mem_cgroup_charge(). + * Uncharge a page previously charged with __mem_cgroup_charge(). */ -void mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct page *page) { struct uncharge_gather ug; - if (mem_cgroup_disabled()) - return; - /* Don't touch page->lru of any random page, pre-check: */ if (!page_memcg(page)) return; @@ -6930,20 +6924,17 @@ void mem_cgroup_uncharge(struct page *page) } /** - * mem_cgroup_uncharge_list - uncharge a list of page + * __mem_cgroup_uncharge_list - uncharge a list of page * @page_list: list of pages to uncharge * * Uncharge a list of pages previously charged with - * mem_cgroup_charge(). + * __mem_cgroup_charge(). */ -void mem_cgroup_uncharge_list(struct list_head *page_list) +void __mem_cgroup_uncharge_list(struct list_head *page_list) { struct uncharge_gather ug; struct page *page; - if (mem_cgroup_disabled()) - return; - uncharge_gather_clear(&ug); list_for_each_entry(page, page_list, lru) uncharge_page(page, &ug); -- cgit v1.2.3 From 01c4b28cd2e600160566a7d83b4703800381dae1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:54 -0700 Subject: mm, memcg: inline swap-related functions to improve disabled memcg config Inline mem_cgroup_try_charge_swap, mem_cgroup_uncharge_swap and cgroup_throttle_swaprate functions to perform mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~1% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Cc: Yang Shi Cc: Alex Shi Cc: Wei Yang Cc: Jens Axboe Cc: Joonsoo Kim Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Alistair Popple Cc: Minchan Kim Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 26 +++++++++++++++++++++++--- mm/memcontrol.c | 14 ++++---------- mm/swapfile.c | 5 +---- 3 files changed, 28 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 6f5a43251593..f30d26b0f71d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -721,7 +721,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); +extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); +static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +{ + if (mem_cgroup_disabled()) + return; + __cgroup_throttle_swaprate(page, gfp_mask); +} #else static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { @@ -730,8 +736,22 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); -extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); +extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); +static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +{ + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_try_charge_swap(page, entry); +} + +extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_swap(entry, nr_pages); +} + extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct page *page); #else diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 33bb8434eea0..81e15a67391b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7226,7 +7226,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) } /** - * mem_cgroup_try_charge_swap - try charging swap space for a page + * __mem_cgroup_try_charge_swap - try charging swap space for a page * @page: page being added to swap * @entry: swap entry to charge * @@ -7234,16 +7234,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * * Returns 0 on success, -ENOMEM on failure. */ -int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) { unsigned int nr_pages = thp_nr_pages(page); struct page_counter *counter; struct mem_cgroup *memcg; unsigned short oldid; - if (mem_cgroup_disabled()) - return 0; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; @@ -7279,18 +7276,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) } /** - * mem_cgroup_uncharge_swap - uncharge swap space + * __mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge * @nr_pages: the amount of swap space to uncharge */ -void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { struct mem_cgroup *memcg; unsigned short id; - if (mem_cgroup_disabled()) - return; - id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); diff --git a/mm/swapfile.c b/mm/swapfile.c index 627b16aed1dc..22d10f713848 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3779,14 +3779,11 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { struct swap_info_struct *si, *next; int nid = page_to_nid(page); - if (mem_cgroup_disabled()) - return; - if (!(gfp_mask & __GFP_IO)) return; -- cgit v1.2.3 From 7e1c0d6f58207e7e60674647d3935f446f05613c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:00 -0700 Subject: memcg: switch lruvec stats to rstat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat") switched memcg stats to rstat infrastructure but skipped the conversion of the lruvec stats as such stats are read in the performance critical code paths and flushing stats may have impacted the performances of the applications. This patch converts the lruvec stats to rstat and later patches add mechanisms to keep the performance impact to minimum. The rstat conversion comes with the price i.e. memory cost. Effectively this patch reverts the savings done by the commit f3344adf38bd ("mm: memcontrol: optimize per-lruvec stats counter memory usage"). However this cost is justified due to negative impact of the inaccurate lruvec stats on many heuristics. One such case is reported in [1]. The memory reclaim code is filled with plethora of heuristics and many of those heuristics reads the lruvec stats. So, inaccurate stats can make such heuristics ineffective. [1] reports the impact of inaccurate lruvec stats on the "cache trim mode" heuristic. Inaccurate lruvec stats can impact the deactivation and aging anon heuristics as well. [1] https://lore.kernel.org/linux-mm/20210311004449.1170308-1-ying.huang@intel.com/ Link: https://lkml.kernel.org/r/20210716212137.1391164-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Tejun Heo Cc: Johannes Weiner Cc: Muchun Song Cc: Michal Hocko Cc: Roman Gushchin Cc: Huang Ying Cc: Hillf Danton Cc: Michal Koutný Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 42 ++++++++--------- mm/memcontrol.c | 114 +++++++++++++++------------------------------ 2 files changed, 58 insertions(+), 98 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a00bf337567b..47d35bef9f6e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter { unsigned int generation; }; -struct lruvec_stat { - long count[NR_VM_NODE_STAT_ITEMS]; -}; - -struct batched_lruvec_stat { - s32 count[NR_VM_NODE_STAT_ITEMS]; -}; - /* * Bitmap and deferred work of shrinker::id corresponding to memcg-aware * shrinkers, which have elements charged to this memcg. @@ -123,24 +115,30 @@ struct shrinker_info { unsigned long *map; }; +struct lruvec_stats_percpu { + /* Local (CPU and cgroup) state */ + long state[NR_VM_NODE_STAT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[NR_VM_NODE_STAT_ITEMS]; +}; + +struct lruvec_stats { + /* Aggregated (CPU and subtree) state */ + long state[NR_VM_NODE_STAT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[NR_VM_NODE_STAT_ITEMS]; +}; + /* * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; - /* - * Legacy local VM stats. This should be struct lruvec_stat and - * cannot be optimized to struct batched_lruvec_stat. Because - * the threshold of the lruvec_stat_cpu can be as big as - * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this - * filed has no upper limit. - */ - struct lruvec_stat __percpu *lruvec_stat_local; - - /* Subtree VM stats (batched updates) */ - struct batched_lruvec_stat __percpu *lruvec_stat_cpu; - atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; + struct lruvec_stats lruvec_stats; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; @@ -997,7 +995,7 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = atomic_long_read(&pn->lruvec_stat[idx]); + x = READ_ONCE(pn->lruvec_stats.state[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -1017,7 +1015,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); for_each_possible_cpu(cpu) - x += per_cpu(pn->lruvec_stat_local->count[idx], cpu); + x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81e15a67391b..400d210e030a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -660,23 +660,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) return x; } -static struct mem_cgroup_per_node * -parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) -{ - struct mem_cgroup *parent; - - parent = parent_mem_cgroup(pn->memcg); - if (!parent) - return NULL; - return parent->nodeinfo[nid]; -} - void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; - long x, threshold = MEMCG_CHARGE_BATCH; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; @@ -685,21 +673,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_state(memcg, idx, val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stat_local->count[idx], val); - - if (vmstat_item_in_bytes(idx)) - threshold <<= PAGE_SHIFT; - - x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); - if (unlikely(abs(x) > threshold)) { - pg_data_t *pgdat = lruvec_pgdat(lruvec); - struct mem_cgroup_per_node *pi; - - for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) - atomic_long_add(x, &pi->lruvec_stat[idx]); - x = 0; - } - __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); + __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); } /** @@ -2278,40 +2252,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu) -{ - int nid; - - for_each_node(nid) { - struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; - unsigned long stat[NR_VM_NODE_STAT_ITEMS]; - struct batched_lruvec_stat *lstatc; - int i; - - lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { - stat[i] = lstatc->count[i]; - lstatc->count[i] = 0; - } - - do { - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - atomic_long_add(stat[i], &pn->lruvec_stat[i]); - } while ((pn = parent_nodeinfo(pn, nid))); - } -} - static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); - for_each_mem_cgroup(memcg) - memcg_flush_lruvec_page_state(memcg, cpu); - return 0; } @@ -5118,17 +5065,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, - GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stat_local) { - kfree(pn); - return 1; - } - - pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat, - GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stat_cpu) { - free_percpu(pn->lruvec_stat_local); + pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, + GFP_KERNEL_ACCOUNT); + if (!pn->lruvec_stats_percpu) { kfree(pn); return 1; } @@ -5149,8 +5088,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return; - free_percpu(pn->lruvec_stat_cpu); - free_percpu(pn->lruvec_stat_local); + free_percpu(pn->lruvec_stats_percpu); kfree(pn); } @@ -5166,15 +5104,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { - int cpu; - memcg_wb_domain_exit(memcg); - /* - * Flush percpu lruvec stats to guarantee the value - * correctness on parent's and all ancestor levels. - */ - for_each_online_cpu(cpu) - memcg_flush_lruvec_page_state(memcg, cpu); __mem_cgroup_free(memcg); } @@ -5407,7 +5337,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct memcg_vmstats_percpu *statc; long delta, v; - int i; + int i, nid; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); @@ -5455,6 +5385,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (parent) parent->vmstats.events_pending[i] += delta; } + + for_each_node_state(nid, N_MEMORY) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct mem_cgroup_per_node *ppn = NULL; + struct lruvec_stats_percpu *lstatc; + + if (parent) + ppn = parent->nodeinfo[nid]; + + lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + delta = pn->lruvec_stats.state_pending[i]; + if (delta) + pn->lruvec_stats.state_pending[i] = 0; + + v = READ_ONCE(lstatc->state[i]); + if (v != lstatc->state_prev[i]) { + delta += v - lstatc->state_prev[i]; + lstatc->state_prev[i] = v; + } + + if (!delta) + continue; + + pn->lruvec_stats.state[i] += delta; + if (ppn) + ppn->lruvec_stats.state_pending[i] += delta; + } + } } #ifdef CONFIG_MMU @@ -6388,6 +6348,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + cgroup_rstat_flush(memcg->css.cgroup); + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; -- cgit v1.2.3 From aa48e47e3906c332eaf1e5d7b58be11d3509ad9f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:04 -0700 Subject: memcg: infrastructure to flush memcg stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment memcg stats are read in four contexts: 1. memcg stat user interfaces 2. dirty throttling 3. page fault 4. memory reclaim Currently the kernel flushes the stats for first two cases. Flushing the stats for remaining two casese may have performance impact. Always flushing the memcg stats on the page fault code path may negatively impacts the performance of the applications. In addition flushing in the memory reclaim code path, though treated as slowpath, can become the source of contention for the global lock taken for stat flushing because when system or memcg is under memory pressure, many tasks may enter the reclaim path. This patch uses following mechanisms to solve these challenges: 1. Periodically flush the stats from root memcg every 2 seconds. This will time limit the out of sync stats. 2. Asynchronously flush the stats after fixed number of stat updates. In the worst case the stat can be out of sync by O(nr_cpus * BATCH) for 2 seconds. 3. For avoiding thundering herd to flush the stats particularly from the memory reclaim context, introduce memcg local spinlock and let only one flusher active at a time. This could have been done through cgroup_rstat_lock lock but that lock is used by other subsystem and for userspace reading memcg stats. So, it is better to keep flushers introduced by this patch decoupled from cgroup_rstat_lock. However we would have to use irqsafe version of rstat flush but that is fine as this code path will be flushing for whole tree and do the work for everyone. No one will be waiting for that worker. [shakeelb@google.com: fix sleep-in-wrong context bug] Link: https://lkml.kernel.org/r/20210716212137.1391164-2-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-2-shakeelb@google.com Signed-off-by: Shakeel Butt Tested-by: Marek Szyprowski Cc: Hillf Danton Cc: Huang Ying Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 34 ++++++++++++++++++++++++++++++++++ mm/vmscan.c | 6 ++++++ 3 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 47d35bef9f6e..c4c5b652d3af 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1023,6 +1023,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } +void mem_cgroup_flush_stats(void); + void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); @@ -1438,6 +1440,10 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } +static inline void mem_cgroup_flush_stats(void) +{ +} + static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 400d210e030a..4d8c9afecf98 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,6 +103,14 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } +/* memcg and lruvec stats flushing */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static void flush_memcg_stats_work(struct work_struct *w); +static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work); +static DEFINE_PER_CPU(unsigned int, stats_flush_threshold); +static DEFINE_SPINLOCK(stats_flush_lock); + #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -674,6 +682,8 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH)) + queue_work(system_unbound_wq, &stats_flush_work); } /** @@ -5240,6 +5250,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); + + if (unlikely(mem_cgroup_is_root(memcg))) + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + 2UL*HZ); return 0; } @@ -5331,6 +5345,26 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } +void mem_cgroup_flush_stats(void) +{ + if (!spin_trylock(&stats_flush_lock)) + return; + + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + spin_unlock(&stats_flush_lock); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); +} + +static void flush_memcg_stats_work(struct work_struct *w) +{ + mem_cgroup_flush_stats(); +} + static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); diff --git a/mm/vmscan.c b/mm/vmscan.c index 268ad6570751..6c401b44a245 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2897,6 +2897,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: + /* + * Flush the memory cgroup stats, so that we read accurate per-memcg + * lruvec stats for heuristics. + */ + mem_cgroup_flush_stats(); + memset(&sc->nr, 0, sizeof(sc->nr)); nr_reclaimed = sc->nr_reclaimed; -- cgit v1.2.3 From 96e51ccf1af33e82f429a0d6baebba29c6448d0f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:46 -0700 Subject: memcg: cleanup racy sum avoidance code We used to have per-cpu memcg and lruvec stats and the readers have to traverse and sum the stats from each cpu. This summing was racy and may expose transient negative values. So, an explicit check was added to avoid such scenarios. Now these stats are moved to rstat infrastructure and are no more per-cpu, so we can remove the fixup for transient negative values. Link: https://lkml.kernel.org/r/20210728012243.3369123-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c4c5b652d3af..2ee20509ac71 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -977,30 +977,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; + return READ_ONCE(memcg->vmstats.state[idx]); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = READ_ONCE(pn->lruvec_stats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; + return READ_ONCE(pn->lruvec_stats.state[idx]); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, -- cgit v1.2.3 From 55a68c823951855f3ec313fdb85100db84284505 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:49 -0700 Subject: memcg: replace in_interrupt() by !in_task() in active_memcg() set_active_memcg() uses in_interrupt() check to select proper storage for cgroup: pointer on task struct or per-cpu pointer. It isn't fully correct: obsoleted in_interrupt() includes tasks with disabled BH. It's better to use '!in_task()' instead. Link: https://lkml.org/lkml/2021/7/26/487 Link: https://lkml.kernel.org/r/ed4448b0-4970-616f-7368-ef9dd3cb628d@virtuozzo.com Fixes: 37d5985c003d ("mm: kmem: prepare remote memcg charging infra for interrupt contexts") Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 2 +- mm/memcontrol.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8894825cc4db..5561486fddef 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -306,7 +306,7 @@ set_active_memcg(struct mem_cgroup *memcg) { struct mem_cgroup *old; - if (in_interrupt()) { + if (!in_task()) { old = this_cpu_read(int_active_memcg); this_cpu_write(int_active_memcg, memcg); } else { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4d8c9afecf98..b09f2639f28b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -878,7 +878,7 @@ EXPORT_SYMBOL(mem_cgroup_from_task); static __always_inline struct mem_cgroup *active_memcg(void) { - if (in_interrupt()) + if (!in_task()) return this_cpu_read(int_active_memcg); else return current->active_memcg; -- cgit v1.2.3 From bec49c067c679e9b7ca7c1aac50b56618c12d879 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:55:56 -0700 Subject: mm, memcg: remove unused functions Since commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat"), last user of memcg_stat_item_in_bytes() is gone. And since commit fa40d1ee9f15 ("mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()"), only the declaration of mem_cgroup_select_victim_node() is remained here. Remove them. Link: https://lkml.kernel.org/r/20210807082835.61281-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Alex Shi Cc: Matthew Wilcox (Oracle) Cc: Vladimir Davydov Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2ee20509ac71..69e6c5462c43 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -593,13 +593,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #endif -static __always_inline bool memcg_stat_item_in_bytes(int idx) -{ - if (idx == MEMCG_PERCPU_B) - return true; - return vmstat_item_in_bytes(idx); -} - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -904,11 +897,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return !!(memcg->css.flags & CSS_ONLINE); } -/* - * For memory reclaim. - */ -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); - void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); -- cgit v1.2.3 From f358afc52c3066f4e8cd7b3a2d75b31e822519e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Sep 2021 14:56:36 -0700 Subject: mm: remove flush_kernel_dcache_page flush_kernel_dcache_page is a rather confusing interface that implements a subset of flush_dcache_page by not being able to properly handle page cache mapped pages. The only callers left are in the exec code as all other previous callers were incorrect as they could have dealt with page cache pages. Replace the calls to flush_kernel_dcache_page with calls to flush_dcache_page, which for all architectures does either exactly the same thing, can contains one or more of the following: 1) an optimization to defer the cache flush for page cache pages not mapped into userspace 2) additional flushing for mapped page cache pages if cache aliases are possible Link: https://lkml.kernel.org/r/20210712060928.4161649-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Reviewed-by: Ira Weiny Cc: Alex Shi Cc: Geoff Levand Cc: Greentime Hu Cc: Guo Ren Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Nick Hu Cc: Paul Cercueil Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Ulf Hansson Cc: Vincent Chen Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/cachetlb.rst | 86 ++++++++++------------ .../translations/zh_CN/core-api/cachetlb.rst | 9 --- arch/arm/include/asm/cacheflush.h | 4 +- arch/arm/mm/flush.c | 33 --------- arch/arm/mm/nommu.c | 6 -- arch/csky/abiv1/cacheflush.c | 11 --- arch/csky/abiv1/inc/abi/cacheflush.h | 4 +- arch/mips/include/asm/cacheflush.h | 8 +- arch/nds32/include/asm/cacheflush.h | 3 +- arch/nds32/mm/cacheflush.c | 9 --- arch/parisc/include/asm/cacheflush.h | 8 +- arch/parisc/kernel/cache.c | 3 +- arch/sh/include/asm/cacheflush.h | 8 +- block/blk-map.c | 2 +- fs/exec.c | 6 +- include/linux/highmem.h | 5 +- tools/testing/scatterlist/linux/mm.h | 1 - 17 files changed, 51 insertions(+), 155 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index fe4290e26729..8aed9103e48a 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -271,10 +271,15 @@ maps this page at its virtual address. ``void flush_dcache_page(struct page *page)`` - Any time the kernel writes to a page cache page, _OR_ - the kernel is about to read from a page cache page and - user space shared/writable mappings of this page potentially - exist, this routine is called. + This routines must be called when: + + a) the kernel did write to a page that is in the page cache page + and / or in high memory + b) the kernel is about to read from a page cache page and user space + shared/writable mappings of this page potentially exist. Note + that {get,pin}_user_pages{_fast} already call flush_dcache_page + on any page found in the user address space and thus driver + code rarely needs to take this into account. .. note:: @@ -284,38 +289,34 @@ maps this page at its virtual address. handling vfs symlinks in the page cache need not call this interface at all. - The phrase "kernel writes to a page cache page" means, - specifically, that the kernel executes store instructions - that dirty data in that page at the page->virtual mapping - of that page. It is important to flush here to handle - D-cache aliasing, to make sure these kernel stores are - visible to user space mappings of that page. - - The corollary case is just as important, if there are users - which have shared+writable mappings of this file, we must make - sure that kernel reads of these pages will see the most recent - stores done by the user. - - If D-cache aliasing is not an issue, this routine may - simply be defined as a nop on that architecture. - - There is a bit set aside in page->flags (PG_arch_1) as - "architecture private". The kernel guarantees that, - for pagecache pages, it will clear this bit when such - a page first enters the pagecache. - - This allows these interfaces to be implemented much more - efficiently. It allows one to "defer" (perhaps indefinitely) - the actual flush if there are currently no user processes - mapping this page. See sparc64's flush_dcache_page and - update_mmu_cache implementations for an example of how to go - about doing this. - - The idea is, first at flush_dcache_page() time, if - page->mapping->i_mmap is an empty tree, just mark the architecture - private page flag bit. Later, in update_mmu_cache(), a check is - made of this flag bit, and if set the flush is done and the flag - bit is cleared. + The phrase "kernel writes to a page cache page" means, specifically, + that the kernel executes store instructions that dirty data in that + page at the page->virtual mapping of that page. It is important to + flush here to handle D-cache aliasing, to make sure these kernel stores + are visible to user space mappings of that page. + + The corollary case is just as important, if there are users which have + shared+writable mappings of this file, we must make sure that kernel + reads of these pages will see the most recent stores done by the user. + + If D-cache aliasing is not an issue, this routine may simply be defined + as a nop on that architecture. + + There is a bit set aside in page->flags (PG_arch_1) as "architecture + private". The kernel guarantees that, for pagecache pages, it will + clear this bit when such a page first enters the pagecache. + + This allows these interfaces to be implemented much more efficiently. + It allows one to "defer" (perhaps indefinitely) the actual flush if + there are currently no user processes mapping this page. See sparc64's + flush_dcache_page and update_mmu_cache implementations for an example + of how to go about doing this. + + The idea is, first at flush_dcache_page() time, if page_file_mapping() + returns a mapping, and mapping_mapped on that mapping returns %false, + just mark the architecture private page flag bit. Later, in + update_mmu_cache(), a check is made of this flag bit, and if set the + flush is done and the flag bit is cleared. .. important:: @@ -351,19 +352,6 @@ maps this page at its virtual address. architectures). For incoherent architectures, it should flush the cache of the page at vmaddr. - ``void flush_kernel_dcache_page(struct page *page)`` - - When the kernel needs to modify a user page is has obtained - with kmap, it calls this function after all modifications are - complete (but before kunmapping it) to bring the underlying - page up to date. It is assumed here that the user has no - incoherent cached copies (i.e. the original page was obtained - from a mechanism like get_user_pages()). The default - implementation is a nop and should remain so on all coherent - architectures. On incoherent architectures, this should flush - the kernel cache for page (using page_address(page)). - - ``void flush_icache_range(unsigned long start, unsigned long end)`` When the kernel stores into addresses that it will execute diff --git a/Documentation/translations/zh_CN/core-api/cachetlb.rst b/Documentation/translations/zh_CN/core-api/cachetlb.rst index 8376485a534d..55827b8a7c53 100644 --- a/Documentation/translations/zh_CN/core-api/cachetlb.rst +++ b/Documentation/translations/zh_CN/core-api/cachetlb.rst @@ -298,15 +298,6 @@ HyperSparc cpu就是这样一个具有这种属性的cpu。 用。默认的实现是nop(对于所有相干的架构应该保持这样)。对于不一致性 的架构,它应该刷新vmaddr处的页面缓存。 - ``void flush_kernel_dcache_page(struct page *page)`` - - 当内核需要修改一个用kmap获得的用户页时,它会在所有修改完成后(但在 - kunmapping之前)调用这个函数,以使底层页面达到最新状态。这里假定用 - 户没有不一致性的缓存副本(即原始页面是从类似get_user_pages()的机制 - 中获得的)。默认的实现是一个nop,在所有相干的架构上都应该如此。在不 - 一致性的架构上,这应该刷新内核缓存中的页面(使用page_address(page))。 - - ``void flush_icache_range(unsigned long start, unsigned long end)`` 当内核存储到它将执行的地址中时(例如在加载模块时),这个函数被调用。 diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 2e24e765e6d3..5e56288e343b 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -291,6 +291,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { if ((cache_is_vivt() || cache_is_vipt_aliasing())) @@ -312,9 +313,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma, __flush_anon_page(vma, page, vmaddr); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -extern void flush_kernel_dcache_page(struct page *); - #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 6d89db7895d1..7ff9feea13a6 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -345,39 +345,6 @@ void flush_dcache_page(struct page *page) } EXPORT_SYMBOL(flush_dcache_page); -/* - * Ensure cache coherency for the kernel mapping of this page. We can - * assume that the page is pinned via kmap. - * - * If the page only exists in the page cache and there are no user - * space mappings, this is a no-op since the page was already marked - * dirty at creation. Otherwise, we need to flush the dirty kernel - * cache lines directly. - */ -void flush_kernel_dcache_page(struct page *page) -{ - if (cache_is_vivt() || cache_is_vipt_aliasing()) { - struct address_space *mapping; - - mapping = page_mapping_file(page); - - if (!mapping || mapping_mapped(mapping)) { - void *addr; - - addr = page_address(page); - /* - * kmap_atomic() doesn't set the page virtual - * address for highmem pages, and - * kunmap_atomic() takes care of cache - * flushing already. - */ - if (!IS_ENABLED(CONFIG_HIGHMEM) || addr) - __cpuc_flush_dcache_area(addr, PAGE_SIZE); - } - } -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - /* * Flush an anonymous page so that users of get_user_pages() * can safely access the data. The expected sequence is: diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 8b3d7191e2b8..2658f52903da 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -166,12 +166,6 @@ void flush_dcache_page(struct page *page) } EXPORT_SYMBOL(flush_dcache_page); -void flush_kernel_dcache_page(struct page *page) -{ - __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *dst, const void *src, unsigned long len) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 07ff17ea33de..fb91b069dc69 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -56,17 +56,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, } } -void flush_kernel_dcache_page(struct page *page) -{ - struct address_space *mapping; - - mapping = page_mapping_file(page); - - if (!mapping || mapping_mapped(mapping)) - dcache_wbinv_all(); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 6cab7afae962..ed62e2066ba7 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -14,12 +14,10 @@ extern void flush_dcache_page(struct page *); #define flush_cache_page(vma, page, pfn) cache_wbinv_all() #define flush_cache_dup_mm(mm) cache_wbinv_all() -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -extern void flush_kernel_dcache_page(struct page *); - #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { dcache_wbinv_all(); diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index d687b40b9fbb..b3dc9c589442 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -125,13 +125,7 @@ static inline void kunmap_noncoherent(void) kunmap_coherent(); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ - BUG_ON(cpu_has_dc_aliases && PageHighMem(page)); - flush_dcache_page(page); -} - +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 /* * For now flush_kernel_vmap_range and invalidate_kernel_vmap_range both do a * cache writeback and invalidate operation. diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index 7d6824f7c0e8..c2a222ebfa2a 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -36,8 +36,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr); -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -void flush_kernel_dcache_page(struct page *page); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 void flush_kernel_vmap_range(void *addr, int size); void invalidate_kernel_vmap_range(void *addr, int size); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages) diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index ad5344ef5d33..07aac65d1cab 100644 --- a/arch/nds32/mm/cacheflush.c +++ b/arch/nds32/mm/cacheflush.c @@ -318,15 +318,6 @@ void flush_anon_page(struct vm_area_struct *vma, local_irq_restore(flags); } -void flush_kernel_dcache_page(struct page *page) -{ - unsigned long flags; - local_irq_save(flags); - cpu_dcache_wbinval_page((unsigned long)page_address(page)); - local_irq_restore(flags); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void flush_kernel_vmap_range(void *addr, int size) { unsigned long flags; diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 99663fc1f997..eef0096db5f8 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -36,16 +36,12 @@ void flush_cache_all_local(void); void flush_cache_all(void); void flush_cache_mm(struct mm_struct *mm); -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE void flush_kernel_dcache_page_addr(void *addr); -static inline void flush_kernel_dcache_page(struct page *page) -{ - flush_kernel_dcache_page_addr(page_address(page)); -} #define flush_kernel_dcache_range(start,size) \ flush_kernel_dcache_range_asm((start), (start)+(size)); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 void flush_kernel_vmap_range(void *vaddr, int size); void invalidate_kernel_vmap_range(void *vaddr, int size); @@ -59,7 +55,7 @@ extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #define flush_icache_page(vma,page) do { \ - flush_kernel_dcache_page(page); \ + flush_kernel_dcache_page_addr(page_address(page)); \ flush_kernel_icache_page(page_address(page)); \ } while (0) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 86a1a63563fd..39e02227e231 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -334,7 +334,7 @@ void flush_dcache_page(struct page *page) return; } - flush_kernel_dcache_page(page); + flush_kernel_dcache_page_addr(page_address(page)); if (!mapping) return; @@ -375,7 +375,6 @@ EXPORT_SYMBOL(flush_dcache_page); /* Defined in arch/parisc/kernel/pacache.S */ EXPORT_SYMBOL(flush_kernel_dcache_range_asm); -EXPORT_SYMBOL(flush_kernel_dcache_page_asm); EXPORT_SYMBOL(flush_data_cache_local); EXPORT_SYMBOL(flush_kernel_icache_range_asm); diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 4486a865ff62..372afa82fee6 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -63,6 +63,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma, if (boot_cpu_data.dcache.n_aliases && PageAnon(page)) __flush_anon_page(page, vmaddr); } + +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { __flush_wback_region(addr, size); @@ -72,12 +74,6 @@ static inline void invalidate_kernel_vmap_range(void *addr, int size) __flush_invalidate_region(addr, size); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ - flush_dcache_page(page); -} - extern void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len); diff --git a/block/blk-map.c b/block/blk-map.c index 3743158ddaeb..4639bc6b5c62 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -309,7 +309,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, static void bio_invalidate_vmalloc_pages(struct bio *bio) { -#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +#ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE if (bio->bi_private && !op_is_write(bio_op(bio))) { unsigned long i, len = 0; diff --git a/fs/exec.c b/fs/exec.c index 38f63451b928..41a888d4edde 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -574,7 +574,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, } if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); + flush_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } @@ -592,7 +592,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, ret = 0; out: if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); + flush_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } @@ -634,7 +634,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) kaddr = kmap_atomic(page); flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); - flush_kernel_dcache_page(page); + flush_dcache_page(page); kunmap_atomic(kaddr); put_arg_page(page); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d9a606a9fc64..b4c49f9cc379 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -130,10 +130,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page } #endif -#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ -} +#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h index f9a12005fcea..16ec895bbe5f 100644 --- a/tools/testing/scatterlist/linux/mm.h +++ b/tools/testing/scatterlist/linux/mm.h @@ -127,7 +127,6 @@ kmalloc_array(unsigned int n, unsigned int size, unsigned int flags) #define kmemleak_free(a) #define PageSlab(p) (0) -#define flush_kernel_dcache_page(p) #define MAX_ERRNO 4095 -- cgit v1.2.3 From e15710bf04063766f428048f4dad7b73b646203f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 2 Sep 2021 14:56:43 -0700 Subject: mm: change fault_in_pages_* to have an unsigned size parameter fault_in_pages_writeable() and fault_in_pages_readable() treat the size parameter as unsigned, doing pointer math with the value, so make this explicit and set it to be a size_t type which all callers currently treat it as anyway. This solves the issue where static checkers get nervous seeing pointer arithmetic happening with a signed value. Link: https://lkml.kernel.org/r/20210727111136.457638-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Reported-by: Jordy Zomer Cc: Matthew Wilcox Cc: David Howells Cc: William Kucharski Cc: "Darrick J. Wong" Cc: Hugh Dickins Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ed02aa522263..5dcf446f42e5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -736,7 +736,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. */ -static inline int fault_in_pages_writeable(char __user *uaddr, int size) +static inline int fault_in_pages_writeable(char __user *uaddr, size_t size) { char __user *end = uaddr + size - 1; @@ -763,7 +763,7 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) return 0; } -static inline int fault_in_pages_readable(const char __user *uaddr, int size) +static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) { volatile char c; const char __user *end = uaddr + size - 1; -- cgit v1.2.3 From fc1f5e980a463325cf41d39ac6a69aa3cca73995 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Thu, 2 Sep 2021 14:57:01 -0700 Subject: mm: sparse: pass section_nr to find_memory_block With CONFIG_SPARSEMEM_EXTREME enabled, __section_nr() which converts mem_section to section_nr could be costly since it iterates all section roots to check if the given mem_section is in its range. On the other hand, __nr_to_section() which converts section_nr to mem_section can be done in O(1). Let's pass section_nr instead of mem_section ptr to find_memory_block() in order to reduce needless iterations. Link: https://lkml.kernel.org/r/20210707150212.855-3-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/pseries/hotplug-memory.c | 4 +--- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 377d852f5a9a..d4f28ee4d5dc 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -211,13 +211,11 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb) { unsigned long section_nr; - struct mem_section *mem_sect; struct memory_block *mem_block; section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr)); - mem_sect = __nr_to_section(section_nr); - mem_block = find_memory_block(mem_sect); + mem_block = find_memory_block(section_nr); return mem_block; } diff --git a/drivers/base/memory.c b/drivers/base/memory.c index aa31a21f33d7..e3fd2dbf4eea 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -578,9 +578,9 @@ static struct memory_block *find_memory_block_by_id(unsigned long block_id) /* * Called under device_hotplug_lock. */ -struct memory_block *find_memory_block(struct mem_section *section) +struct memory_block *find_memory_block(unsigned long section_nr) { - unsigned long block_id = memory_block_id(__section_nr(section)); + unsigned long block_id = memory_block_id(section_nr); return find_memory_block_by_id(block_id); } diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a..d9a0b61cd432 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -90,7 +90,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); -extern struct memory_block *find_memory_block(struct mem_section *); +extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); -- cgit v1.2.3 From 11e02d3729da1a2d4a33db5ea61291770d411884 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Thu, 2 Sep 2021 14:57:04 -0700 Subject: mm: sparse: remove __section_nr() function As the last users of __section_nr() are gone, let's remove unused function __section_nr(). Link: https://lkml.kernel.org/r/20210707150212.855-4-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - mm/sparse.c | 26 -------------------------- 2 files changed, 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fcb535560028..8827f4d081d4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1342,7 +1342,6 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } -extern unsigned long __section_nr(struct mem_section *ms); extern size_t mem_section_usage_size(void); /* diff --git a/mm/sparse.c b/mm/sparse.c index 8018ee7fcda5..d85655242ed9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -109,32 +109,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) } #endif -#ifdef CONFIG_SPARSEMEM_EXTREME -unsigned long __section_nr(struct mem_section *ms) -{ - unsigned long root_nr; - struct mem_section *root = NULL; - - for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { - root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); - if (!root) - continue; - - if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) - break; - } - - VM_BUG_ON(!root); - - return (root_nr * SECTIONS_PER_ROOT) + (ms - root); -} -#else -unsigned long __section_nr(struct mem_section *ms) -{ - return (unsigned long)(ms - mem_section[0]); -} -#endif - /* * During early boot, before section_mem_map is used for an actual * mem_map, we use section_mem_map to store the section's NUMA -- cgit v1.2.3 From 01c8d337d195ed105cabab95bc4dcb9e145bf5ea Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 2 Sep 2021 14:57:07 -0700 Subject: mm/sparse: set SECTION_NID_SHIFT to 6 Currently SECTION_NID_SHIFT is set to 3, which is incorrect because bit 3 and 4 can be overlapped by sub-field for early NID, and can be unexpectedly set on NUMA systems. There are a few non-critical issues related to this: - Having SECTION_TAINT_ZONE_DEVICE set for wrong sections forces pfn_to_online_page() through the slow path, but doesn't actually break the kernel. - A kdump generation tool like makedumpfile uses this field to calculate the physical address to read. So wrong bits can make the tool access to wrong address and fail to create kdump. This can be avoided by the tool, so it's not critical. To fix it, set SECTION_NID_SHIFT to 6 which is the minimum number of available bits of section flag field. Link: https://lkml.kernel.org/r/20210707045548.810271-1-naoya.horiguchi@linux.dev Fixes: 1f90a3477df3 ("mm: teach pfn_to_online_page() about ZONE_DEVICE section collisions") Signed-off-by: Naoya Horiguchi Reported-by: Kazuhito Hagio Suggested-by: Dan Williams Acked-by: David Hildenbrand Cc: Oscar Salvador Cc: Wang Wensheng Cc: Rui Xiang Cc: Kazu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8827f4d081d4..59bad25ce78e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1364,7 +1364,7 @@ extern size_t mem_section_usage_size(void); #define SECTION_TAINT_ZONE_DEVICE (1UL<<4) #define SECTION_MAP_LAST_BIT (1UL<<5) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 3 +#define SECTION_NID_SHIFT 6 static inline struct page *__section_mem_map_addr(struct mem_section *section) { -- cgit v1.2.3 From d0505e9f7dcec85da6634ec66da2b17656ee177b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:58:31 -0700 Subject: mm: hwpoison: don't drop slab caches for offlining non-LRU page In the current implementation of soft offline, if non-LRU page is met, all the slab caches will be dropped to free the page then offline. But if the page is not slab page all the effort is wasted in vain. Even though it is a slab page, it is not guaranteed the page could be freed at all. However the side effect and cost is quite high. It does not only drop the slab caches, but also may drop a significant amount of page caches which are associated with inode caches. It could make the most workingset gone in order to just offline a page. And the offline is not guaranteed to succeed at all, actually I really doubt the success rate for real life workload. Furthermore the worse consequence is the system may be locked up and unusable since the page cache release may incur huge amount of works queued for memcg release. Actually we ran into such unpleasant case in our production environment. Firstly, the workqueue of memory_failure_work_func is locked up as below: BUG: workqueue lockup - pool cpus=1 node=0 flags=0x0 nice=0 stuck for 53s! Showing busy workqueues and worker pools: workqueue events: flags=0x0 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=14/256 refcnt=15 in-flight: 409271:memory_failure_work_func pending: kfree_rcu_work, kfree_rcu_monitor, kfree_rcu_work, rht_deferred_worker, rht_deferred_worker, rht_deferred_worker, rht_deferred_worker, kfree_rcu_work, kfree_rcu_work, kfree_rcu_work, kfree_rcu_work, drain_local_stock, kfree_rcu_work workqueue mm_percpu_wq: flags=0x8 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256 refcnt=2 pending: vmstat_update workqueue cgroup_destroy: flags=0x0 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/1 refcnt=12072 pending: css_release_work_fn There were over 12K css_release_work_fn queued, and this caused a few lockups due to the contention of worker pool lock with IRQ disabled, for example: NMI watchdog: Watchdog detected hard LOCKUP on cpu 1 Modules linked in: amd64_edac_mod edac_mce_amd crct10dif_pclmul crc32_pclmul ghash_clmulni_intel xt_DSCP iptable_mangle kvm_amd bpfilter vfat fat acpi_ipmi i2c_piix4 usb_storage ipmi_si k10temp i2c_core ipmi_devintf ipmi_msghandler acpi_cpufreq sch_fq_codel xfs libcrc32c crc32c_intel mlx5_core mlxfw nvme xhci_pci ptp nvme_core pps_core xhci_hcd CPU: 1 PID: 205500 Comm: kworker/1:0 Tainted: G L 5.10.32-t1.el7.twitter.x86_64 #1 Hardware name: TYAN F5AMT /z /S8026GM2NRE-CGN, BIOS V8.030 03/30/2021 Workqueue: events memory_failure_work_func RIP: 0010:queued_spin_lock_slowpath+0x41/0x1a0 Code: 41 f0 0f ba 2f 08 0f 92 c0 0f b6 c0 c1 e0 08 89 c2 8b 07 30 e4 09 d0 a9 00 01 ff ff 75 1b 85 c0 74 0e 8b 07 84 c0 74 08 f3 90 <8b> 07 84 c0 75 f8 b8 01 00 00 00 66 89 07 c3 f6 c4 01 75 04 c6 47 RSP: 0018:ffff9b2ac278f900 EFLAGS: 00000002 RAX: 0000000000480101 RBX: ffff8ce98ce71800 RCX: 0000000000000084 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8ce98ce6a140 RBP: 00000000000284c8 R08: ffffd7248dcb6808 R09: 0000000000000000 R10: 0000000000000003 R11: ffff9b2ac278f9b0 R12: 0000000000000001 R13: ffff8cb44dab9c00 R14: ffffffffbd1ce6a0 R15: ffff8cacaa37f068 FS: 0000000000000000(0000) GS:ffff8ce98ce40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fcf6e8cb000 CR3: 0000000a0c60a000 CR4: 0000000000350ee0 Call Trace: __queue_work+0xd6/0x3c0 queue_work_on+0x1c/0x30 uncharge_batch+0x10e/0x110 mem_cgroup_uncharge_list+0x6d/0x80 release_pages+0x37f/0x3f0 __pagevec_release+0x1c/0x50 __invalidate_mapping_pages+0x348/0x380 inode_lru_isolate+0x10a/0x160 __list_lru_walk_one+0x7b/0x170 list_lru_walk_one+0x4a/0x60 prune_icache_sb+0x37/0x50 super_cache_scan+0x123/0x1a0 do_shrink_slab+0x10c/0x2c0 shrink_slab+0x1f1/0x290 drop_slab_node+0x4d/0x70 soft_offline_page+0x1ac/0x5b0 memory_failure_work_func+0x6a/0x90 process_one_work+0x19e/0x340 worker_thread+0x30/0x360 kthread+0x116/0x130 The lockup made the machine is quite unusable. And it also made the most workingset gone, the reclaimabled slab caches were reduced from 12G to 300MB, the page caches were decreased from 17G to 4G. But the most disappointing thing is all the effort doesn't make the page offline, it just returns: soft_offline: 0x1469f2: unknown non LRU page type 5ffff0000000000 () It seems the aggressive behavior for non-LRU page didn't pay back, so it doesn't make too much sense to keep it considering the terrible side effect. Link: https://lkml.kernel.org/r/20210819054116.266126-1-shy828301@gmail.com Signed-off-by: Yang Shi Reported-by: David Mackey Acked-by: David Hildenbrand Acked-by: Naoya Horiguchi Cc: Oscar Salvador Cc: Matthew Wilcox (Oracle) Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/hwpoison-inject.c | 2 +- mm/memory-failure.c | 18 ++++++++---------- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 35bbac32b6f6..11c38550627c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3110,7 +3110,7 @@ extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p, int access); +extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 1ae1ebc2b9b1..aff4d27ec235 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -30,7 +30,7 @@ static int hwpoison_inject(void *data, u64 val) if (!hwpoison_filter_enable) goto inject; - shake_page(hpage, 0); + shake_page(hpage); /* * This implies unable to support non-LRU pages. */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f83a2af0af18..5decacb86b9f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -282,9 +282,9 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) /* * Unknown page type encountered. Try to check whether it can turn PageLRU by - * lru_add_drain_all, or a free page by reclaiming slabs when possible. + * lru_add_drain_all. */ -void shake_page(struct page *p, int access) +void shake_page(struct page *p) { if (PageHuge(p)) return; @@ -296,11 +296,9 @@ void shake_page(struct page *p, int access) } /* - * Only call drop_slab_node here (which would also shrink - * other caches) if access is not potentially fatal. + * TODO: Could shrink slab caches here if a lightweight range-based + * shrinker will be available. */ - if (access) - drop_slab_node(page_to_nid(p)); } EXPORT_SYMBOL_GPL(shake_page); @@ -1205,7 +1203,7 @@ try_again: * page, retry. */ if (pass++ < 3) { - shake_page(p, 1); + shake_page(p); goto try_again; } ret = -EIO; @@ -1222,7 +1220,7 @@ try_again: */ if (pass++ < 3) { put_page(p); - shake_page(p, 1); + shake_page(p); count_increased = false; goto try_again; } @@ -1369,7 +1367,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * shake_page() again to ensure that it's flushed. */ if (mlocked) - shake_page(hpage, 0); + shake_page(hpage); /* * Now that the dirty bit has been propagated to the @@ -1723,7 +1721,7 @@ try_again: * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - shake_page(p, 0); + shake_page(p); lock_page(p); -- cgit v1.2.3 From 09a26e832705fdb7a9484495b71a05e0bbc65207 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 2 Sep 2021 14:58:53 -0700 Subject: hugetlb: fix hugetlb cgroup refcounting during vma split Guillaume Morin reported hitting the following WARNING followed by GPF or NULL pointer deference either in cgroups_destroy or in the kill_css path.: percpu ref (css_release) <= 0 (-1) after switching to atomic WARNING: CPU: 23 PID: 130 at lib/percpu-refcount.c:196 percpu_ref_switch_to_atomic_rcu+0x127/0x130 CPU: 23 PID: 130 Comm: ksoftirqd/23 Kdump: loaded Tainted: G O 5.10.60 #1 RIP: 0010:percpu_ref_switch_to_atomic_rcu+0x127/0x130 Call Trace: rcu_core+0x30f/0x530 rcu_core_si+0xe/0x10 __do_softirq+0x103/0x2a2 run_ksoftirqd+0x2b/0x40 smpboot_thread_fn+0x11a/0x170 kthread+0x10a/0x140 ret_from_fork+0x22/0x30 Upon further examination, it was discovered that the css structure was associated with hugetlb reservations. For private hugetlb mappings the vma points to a reserve map that contains a pointer to the css. At mmap time, reservations are set up and a reference to the css is taken. This reference is dropped in the vma close operation; hugetlb_vm_op_close. However, if a vma is split no additional reference to the css is taken yet hugetlb_vm_op_close will be called twice for the split vma resulting in an underflow. Fix by taking another reference in hugetlb_vm_op_open. Note that the reference is only taken for the owner of the reserve map. In the more common fork case, the pointer to the reserve map is cleared for non-owning vmas. Link: https://lkml.kernel.org/r/20210830215015.155224-1-mike.kravetz@oracle.com Fixes: e9fe92ae0cd2 ("hugetlb_cgroup: add reservation accounting for private mappings") Signed-off-by: Mike Kravetz Reported-by: Guillaume Morin Suggested-by: Guillaume Morin Tested-by: Guillaume Morin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 12 ++++++++++++ mm/hugetlb.c | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 0b8d1fdda3a1..c137396129db 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -121,6 +121,13 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) css_put(&h_cg->css); } +static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ + if (resv_map->css) + css_get(resv_map->css); +} + extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, @@ -199,6 +206,11 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) { } +static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ +} + static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd1c1e7d970b..41a1778d3f67 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4106,8 +4106,10 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ - if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); + } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) -- cgit v1.2.3 From a759a909d42d727e918bd5248d6cff7562fa8109 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 2 Sep 2021 14:58:56 -0700 Subject: userfaultfd: change mmap_changing to atomic Patch series "userfaultfd: minor bug fixes". Three unrelated bug fixes. The first two addresses possible issues (not too theoretical ones), but I did not encounter them in practice. The third patch addresses a test bug that causes the test to fail on my system. It has been sent before as part of a bigger RFC. This patch (of 3): mmap_changing is currently a boolean variable, which is set and cleared without any lock that protects against concurrent modifications. mmap_changing is supposed to mark whether userfaultfd page-faults handling should be retried since mappings are undergoing a change. However, concurrent calls, for instance to madvise(MADV_DONTNEED), might cause mmap_changing to be false, although the remove event was still not read (hence acknowledged) by the user. Change mmap_changing to atomic_t and increase/decrease appropriately. Add a debug assertion to see whether mmap_changing is negative. Link: https://lkml.kernel.org/r/20210808020724.1022515-1-namit@vmware.com Link: https://lkml.kernel.org/r/20210808020724.1022515-2-namit@vmware.com Fixes: df2cc96e77011 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races") Signed-off-by: Nadav Amit Cc: Mike Rapoport Cc: Peter Xu Cc: Axel Rasmussen Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 25 +++++++++++++------------ include/linux/userfaultfd_k.h | 8 ++++---- mm/userfaultfd.c | 15 ++++++++------- 3 files changed, 25 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5c2d806e6ae5..29a3016f16c9 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -74,7 +74,7 @@ struct userfaultfd_ctx { /* released */ bool released; /* memory mappings are changing because of non-cooperative event */ - bool mmap_changing; + atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; }; @@ -623,7 +623,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, * already released. */ out: - WRITE_ONCE(ctx->mmap_changing, false); + atomic_dec(&ctx->mmap_changing); + VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0); userfaultfd_ctx_put(ctx); } @@ -669,12 +670,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; - ctx->mmap_changing = false; + atomic_set(&ctx->mmap_changing, 0); ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); - WRITE_ONCE(octx->mmap_changing, true); + atomic_inc(&octx->mmap_changing); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -721,7 +722,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -766,7 +767,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); mmap_read_unlock(mm); msg_init(&ewq.msg); @@ -810,7 +811,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, return -ENOMEM; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1700,7 +1701,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -1757,7 +1758,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -1807,7 +1808,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range range; bool mode_wp, mode_dontwake; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) return -EAGAIN; user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; @@ -1855,7 +1856,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) user_uffdio_continue = (struct uffdio_continue __user *)arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -2087,7 +2088,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; - ctx->mmap_changing = false; + atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; /* prevent the mm struct to be freed */ mmgrab(ctx->mm); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 331d2ccf0bcc..33cea484d1ad 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -60,16 +60,16 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing, __u64 mode); + atomic_t *mmap_changing, __u64 mode); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, - bool *mmap_changing); + atomic_t *mmap_changing); extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, bool *mmap_changing); + unsigned long len, atomic_t *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, - bool enable_wp, bool *mmap_changing); + bool enable_wp, atomic_t *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0e2132834bc7..7a9008415534 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -483,7 +483,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_start, unsigned long len, enum mcopy_atomic_mode mcopy_mode, - bool *mmap_changing, + atomic_t *mmap_changing, __u64 mode) { struct vm_area_struct *dst_vma; @@ -517,7 +517,7 @@ retry: * request the user to retry later */ err = -EAGAIN; - if (mmap_changing && READ_ONCE(*mmap_changing)) + if (mmap_changing && atomic_read(mmap_changing)) goto out_unlock; /* @@ -650,28 +650,29 @@ out: ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing, __u64 mode) + atomic_t *mmap_changing, __u64 mode) { return __mcopy_atomic(dst_mm, dst_start, src_start, len, MCOPY_ATOMIC_NORMAL, mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool *mmap_changing) + unsigned long len, atomic_t *mmap_changing) { return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, mmap_changing, 0); } ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool *mmap_changing) + unsigned long len, atomic_t *mmap_changing) { return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, mmap_changing, 0); } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool enable_wp, bool *mmap_changing) + unsigned long len, bool enable_wp, + atomic_t *mmap_changing) { struct vm_area_struct *dst_vma; pgprot_t newprot; @@ -694,7 +695,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, * request the user to retry later */ err = -EAGAIN; - if (mmap_changing && READ_ONCE(*mmap_changing)) + if (mmap_changing && atomic_read(mmap_changing)) goto out_unlock; err = -ENOENT; -- cgit v1.2.3 From 5ac95884a784e822b8cbe3d4bd6e9f96b3b71e3f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:59:13 -0700 Subject: mm/migrate: enable returning precise migrate_pages() success count Under normal circumstances, migrate_pages() returns the number of pages migrated. In error conditions, it returns an error code. When returning an error code, there is no way to know how many pages were migrated or not migrated. Make migrate_pages() return how many pages are demoted successfully for all cases, including when encountering errors. Page reclaim behavior will depend on this in subsequent patches. Link: https://lkml.kernel.org/r/20210721063926.3024591-3-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-4-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Suggested-by: Oscar Salvador [optional parameter] Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 +++-- mm/compaction.c | 2 +- mm/gup.c | 2 +- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 4 ++-- mm/migrate.c | 11 ++++++++--- mm/page_alloc.c | 2 +- 8 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 23dadf7aeba8..8ab88d46318e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,7 +41,8 @@ extern int migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, - unsigned long private, enum migrate_mode mode, int reason); + unsigned long private, enum migrate_mode mode, int reason, + unsigned int *ret_succeeded); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); @@ -56,7 +57,7 @@ extern int migrate_page_move_mapping(struct address_space *mapping, static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, - int reason) + int reason, unsigned int *ret_succeeded) { return -ENOSYS; } static inline struct page *alloc_migration_target(struct page *page, unsigned long private) diff --git a/mm/compaction.c b/mm/compaction.c index 621508e0ecd5..61fb64f47a06 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2398,7 +2398,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION); + MR_COMPACTION, NULL); trace_mm_compaction_migratepages(cc->nr_migratepages, err, &cc->migratepages); diff --git a/mm/gup.c b/mm/gup.c index 1c7f4ec6990b..9935a4480710 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1772,7 +1772,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, if (!list_empty(&movable_page_list)) { ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_LONGTERM_PIN); + MR_LONGTERM_PIN, NULL); if (ret && !list_empty(&movable_page_list)) putback_movable_pages(&movable_page_list); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2f925615e573..517789b03961 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2099,7 +2099,7 @@ static int __soft_offline_page(struct page *page) if (isolate_page(hpage, &pagelist)) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { bool release = !huge; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 86c3af79e874..4c527a80b6c9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1469,7 +1469,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (nodes_empty(nmask)) node_set(mtc.nid, nmask); ret = migrate_pages(&source, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) { list_for_each_entry(page, &source, lru) { if (__ratelimit(&migrate_rs)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e32360e90274..939eabcaf488 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } @@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_page, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); if (nr_failed) putback_movable_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 0c12af203b68..ae923e9b8874 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1429,6 +1429,8 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. + * @ret_succeeded: Set to the number of pages migrated successfully if + * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. @@ -1439,7 +1441,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason) + enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int retry = 1; int thp_retry = 1; @@ -1594,6 +1596,9 @@ out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; + if (ret_succeeded) + *ret_succeeded = nr_succeeded; + return rc; } @@ -1663,7 +1668,7 @@ static int do_move_pages_to_node(struct mm_struct *mm, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(pagelist); return err; @@ -2178,7 +2183,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, *new, NULL, node, - MIGRATE_ASYNC, MR_NUMA_MISPLACED); + MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cafdca874e0d..f95e1d2386a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8990,7 +8990,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migration_target, - NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless -- cgit v1.2.3 From 26aa2d199d6f2cfa6f2ef2a5dfe891f2250e71a0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 14:59:16 -0700 Subject: mm/migrate: demote pages during reclaim This is mostly derived from a patch from Yang Shi: https://lore.kernel.org/linux-mm/1560468577-101178-10-git-send-email-yang.shi@linux.alibaba.com/ Add code to the reclaim path (shrink_page_list()) to "demote" data to another NUMA node instead of discarding the data. This always avoids the cost of I/O needed to read the page back in and sometimes avoids the writeout cost when the page is dirty. A second pass through shrink_page_list() will be made if any demotions fail. This essentially falls back to normal reclaim behavior in the case that demotions fail. Previous versions of this patch may have simply failed to reclaim pages which were eligible for demotion but were unable to be demoted in practice. For some cases, for example, MADV_PAGEOUT, the pages are always discarded instead of demoted to follow the kernel API definition. Because MADV_PAGEOUT is defined as freeing specified pages regardless in which tier they are. Note: This just adds the start of infrastructure for migration. It is actually disabled next to the FIXME in migrate_demote_page_ok(). [dave.hansen@linux.intel.com: v11] Link: https://lkml.kernel.org/r/20210715055145.195411-5-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210721063926.3024591-4-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-5-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Oscar Salvador Reviewed-by: Zi Yan Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 9 +++++ include/trace/events/migrate.h | 3 +- mm/vmscan.c | 85 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8ab88d46318e..326250996b4e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -28,6 +28,7 @@ enum migrate_reason { MR_NUMA_MISPLACED, MR_CONTIG_RANGE, MR_LONGTERM_PIN, + MR_DEMOTION, MR_TYPES }; @@ -167,6 +168,14 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); +int next_demotion_node(int node); + +#else /* CONFIG_MIGRATION disabled: */ + +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_MIGRATION */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 9fb2a3bbcdfb..779f3fad9ecd 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -21,7 +21,8 @@ EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ EM( MR_CONTIG_RANGE, "contig_range") \ - EMe(MR_LONGTERM_PIN, "longterm_pin") + EM( MR_LONGTERM_PIN, "longterm_pin") \ + EMe(MR_DEMOTION, "demotion") /* * First define the enums in the above macros to be exported to userspace diff --git a/mm/vmscan.c b/mm/vmscan.c index 6c401b44a245..f26b247f5daf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -121,6 +122,9 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1; + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + /* Allocation order */ s8 order; @@ -518,6 +522,17 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); } +static bool can_demote(int nid, struct scan_control *sc) +{ + if (sc->no_demotion) + return false; + if (next_demotion_node(nid) == NUMA_NO_NODE) + return false; + + // FIXME: actually enable this later in the series + return false; +} + /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is @@ -1263,6 +1278,49 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } +static struct page *alloc_demote_page(struct page *page, unsigned long node) +{ + struct migration_target_control mtc = { + /* + * Allocate from 'node', or fail quickly and quietly. + * When this happens, 'page' will likely just be discarded + * instead of migrated. + */ + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | + __GFP_THISNODE | __GFP_NOWARN | + __GFP_NOMEMALLOC | GFP_NOWAIT, + .nid = node + }; + + return alloc_migration_target(page, (unsigned long)&mtc); +} + +/* + * Take pages on @demote_list and attempt to demote them to + * another node. Pages which are not demoted are left on + * @demote_pages. + */ +static unsigned int demote_page_list(struct list_head *demote_pages, + struct pglist_data *pgdat) +{ + int target_nid = next_demotion_node(pgdat->node_id); + unsigned int nr_succeeded; + int err; + + if (list_empty(demote_pages)) + return 0; + + if (target_nid == NUMA_NO_NODE) + return 0; + + /* Demotion ignores all cpuset and mempolicy settings */ + err = migrate_pages(demote_pages, alloc_demote_page, NULL, + target_nid, MIGRATE_ASYNC, MR_DEMOTION, + &nr_succeeded); + + return nr_succeeded; +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1274,12 +1332,16 @@ static unsigned int shrink_page_list(struct list_head *page_list, { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); + LIST_HEAD(demote_pages); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; + bool do_demote_pass; memset(stat, 0, sizeof(*stat)); cond_resched(); + do_demote_pass = can_demote(pgdat->node_id, sc); +retry: while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; @@ -1428,6 +1490,17 @@ static unsigned int shrink_page_list(struct list_head *page_list, ; /* try to reclaim the page below */ } + /* + * Before reclaiming the page, try to relocate + * its contents to another node. + */ + if (do_demote_pass && + (thp_migration_supported() || !PageTransHuge(page))) { + list_add(&page->lru, &demote_pages); + unlock_page(page); + continue; + } + /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. @@ -1679,6 +1752,17 @@ keep: list_add(&page->lru, &ret_pages); VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } + /* 'page_list' is always empty here */ + + /* Migrate pages selected for demotion */ + nr_reclaimed += demote_page_list(&demote_pages, pgdat); + /* Pages that could not be demoted are still in @demote_pages */ + if (!list_empty(&demote_pages)) { + /* Pages which failed to demoted go back on @page_list for retry: */ + list_splice_init(&demote_pages, page_list); + do_demote_pass = false; + goto retry; + } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; @@ -2326,6 +2410,7 @@ unsigned long reclaim_pages(struct list_head *page_list) .may_writepage = 1, .may_unmap = 1, .may_swap = 1, + .no_demotion = 1, }; noreclaim_flag = memalloc_noreclaim_save(); -- cgit v1.2.3 From 668e4147d8850df32ca41e28f52c146025ca45c6 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:59:19 -0700 Subject: mm/vmscan: add page demotion counter Account the number of demoted pages. Add pgdemote_kswapd and pgdemote_direct VM counters showed in /proc/vmstat. [ daveh: - __count_vm_events() a bit, and made them look at the THP size directly rather than getting data from migrate_pages() ] Link: https://lkml.kernel.org/r/20210721063926.3024591-5-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-6-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Zi Yan Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Oscar Salvador Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 2 ++ mm/vmscan.c | 5 +++++ mm/vmstat.c | 2 ++ 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ae0dd1948c2b..a185cc75ff52 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -33,6 +33,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_DIRECT_THROTTLE, diff --git a/mm/vmscan.c b/mm/vmscan.c index f26b247f5daf..88593b82a8df 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1318,6 +1318,11 @@ static unsigned int demote_page_list(struct list_head *demote_pages, target_nid, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); + if (current_is_kswapd()) + __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded); + else + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); + return nr_succeeded; } diff --git a/mm/vmstat.c b/mm/vmstat.c index b0534e068166..ec5a2e789dd2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1217,6 +1217,8 @@ const char * const vmstat_text[] = { "pgreuse", "pgsteal_kswapd", "pgsteal_direct", + "pgdemote_kswapd", + "pgdemote_direct", "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", -- cgit v1.2.3 From 20b51af15e014cac63b58a4f8b8b323ac35bccce Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 2 Sep 2021 14:59:33 -0700 Subject: mm/migrate: add sysfs interface to enable reclaim migration Some method is obviously needed to enable reclaim-based migration. Just like traditional autonuma, there will be some workloads that will benefit like workloads with more "static" configurations where hot pages stay hot and cold pages stay cold. If pages come and go from the hot and cold sets, the benefits of this approach will be more limited. The benefits are truly workload-based and *not* hardware-based. We do not believe that there is a viable threshold where certain hardware configurations should have this mechanism enabled while others do not. To be conservative, earlier work defaulted to disable reclaim- based migration and did not include a mechanism to enable it. This proposes add a new sysfs file /sys/kernel/mm/numa/demotion_enabled as a method to enable it. We are open to any alternative that allows end users to enable this mechanism or disable it if workload harm is detected (just like traditional autonuma). Once this is enabled page demotion may move data to a NUMA node that does not fall into the cpuset of the allocating process. This could be construed to violate the guarantees of cpusets. However, since this is an opt-in mechanism, the assumption is that anyone enabling it is content to relax the guarantees. Link: https://lkml.kernel.org/r/20210721063926.3024591-9-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-10-ying.huang@intel.com Signed-off-by: Huang Ying Originally-by: Dave Hansen Cc: Michal Hocko Cc: Wei Xu Cc: Yang Shi Cc: Zi Yan Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-kernel-mm-numa | 24 ++++++++++ include/linux/mempolicy.h | 4 ++ mm/mempolicy.c | 61 ++++++++++++++++++++++++++ mm/vmscan.c | 5 ++- 4 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-numa (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa new file mode 100644 index 000000000000..77e559d4ed80 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa @@ -0,0 +1,24 @@ +What: /sys/kernel/mm/numa/ +Date: June 2021 +Contact: Linux memory management mailing list +Description: Interface for NUMA + +What: /sys/kernel/mm/numa/demotion_enabled +Date: June 2021 +Contact: Linux memory management mailing list +Description: Enable/disable demoting pages during reclaim + + Page migration during reclaim is intended for systems + with tiered memory configurations. These systems have + multiple types of memory with varied performance + characteristics instead of plain NUMA systems where + the same kind of memory is found at varied distances. + Allowing page migration during reclaim enables these + systems to migrate pages from fast tiers to slow tiers + when the fast tier is under pressure. This migration + is performed before swap. It may move data to a NUMA + node that does not fall into the cpuset of the + allocating process which might be construed to violate + the guarantees of cpusets. This should not be enabled + on systems which need strict cpuset location + guarantees. diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0aaf91b496e2..4ca025e2a77e 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -184,6 +184,8 @@ extern bool vma_migratable(struct vm_area_struct *vma); extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); +extern bool numa_demotion_enabled; + #else struct mempolicy {}; @@ -292,5 +294,7 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) { return NULL; } + +#define numa_demotion_enabled false #endif /* CONFIG_NUMA */ #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 939eabcaf488..e675bfb856da 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3021,3 +3021,64 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +bool numa_demotion_enabled = false; + +#ifdef CONFIG_SYSFS +static ssize_t numa_demotion_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + numa_demotion_enabled? "true" : "false"); +} + +static ssize_t numa_demotion_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + numa_demotion_enabled = true; + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + numa_demotion_enabled = false; + else + return -EINVAL; + + return count; +} + +static struct kobj_attribute numa_demotion_enabled_attr = + __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, + numa_demotion_enabled_store); + +static struct attribute *numa_attrs[] = { + &numa_demotion_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group numa_attr_group = { + .attrs = numa_attrs, +}; + +static int __init numa_init_sysfs(void) +{ + int err; + struct kobject *numa_kobj; + + numa_kobj = kobject_create_and_add("numa", mm_kobj); + if (!numa_kobj) { + pr_err("failed to create numa kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(numa_kobj, &numa_attr_group); + if (err) { + pr_err("failed to register numa group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(numa_kobj); + return err; +} +subsys_initcall(numa_init_sysfs); +#endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 43289f5f8488..2255025f1891 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -524,6 +524,8 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, static bool can_demote(int nid, struct scan_control *sc) { + if (!numa_demotion_enabled) + return false; if (sc) { if (sc->no_demotion) return false; @@ -534,8 +536,7 @@ static bool can_demote(int nid, struct scan_control *sc) if (next_demotion_node(nid) == NUMA_NO_NODE) return false; - // FIXME: actually enable this later in the series - return false; + return true; } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, -- cgit v1.2.3 From 9647875be52b33fe22cb034ec3074896c581543f Mon Sep 17 00:00:00 2001 From: Hui Su Date: Thu, 2 Sep 2021 14:59:36 -0700 Subject: mm/vmpressure: replace vmpressure_to_css() with vmpressure_to_memcg() We can get memcg directly form vmpr instead of vmpr->memcg->css->memcg, so add a new func helper vmpressure_to_memcg(). And no code will use vmpressure_to_css(), so delete it. Link: https://lkml.kernel.org/r/20210630112146.455103-1-suhui@zeku.com Signed-off-by: Hui Su Acked-by: Michal Hocko Acked-by: Chris Down Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmpressure.h | 2 +- mm/memcontrol.c | 4 ++-- mm/vmpressure.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 6d28bc433c1c..6a2f51ebbfd3 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -37,7 +37,7 @@ extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); extern void vmpressure_init(struct vmpressure *vmpr); extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); -extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); +extern struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr); extern int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 06ae0075e864..896f0f403c52 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -256,9 +256,9 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) return &memcg->vmpressure; } -struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) +struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) { - return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; + return container_of(vmpr, struct mem_cgroup, vmpressure); } #ifdef CONFIG_MEMCG_KMEM diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 9b172561fded..76518e4166dc 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -74,8 +74,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { - struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr); memcg = parent_mem_cgroup(memcg); if (!memcg) -- cgit v1.2.3 From b87c517ac5de168aec6e8318ca0707b11b2ccfaf Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:59:46 -0700 Subject: mm/vmscan: remove unneeded return value of kswapd_run() The return value of kswapd_run() is unused now. Clean it up. Link: https://lkml.kernel.org/r/20210717065911.61497-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Michal Hocko Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Minchan Kim Cc: Shaohua Li Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/vmscan.c | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index f30d26b0f71d..ba52f3a3478e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -408,7 +408,7 @@ static inline bool node_reclaim_enabled(void) extern void check_move_unevictable_pages(struct pagevec *pvec); -extern int kswapd_run(int nid); +extern void kswapd_run(int nid); extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP diff --git a/mm/vmscan.c b/mm/vmscan.c index 8857e4dcbfd3..ab5019700dc3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4434,23 +4434,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ -int kswapd_run(int nid) +void kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - int ret = 0; if (pgdat->kswapd) - return 0; + return; pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); - ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; } - return ret; } /* -- cgit v1.2.3 From 65d759c8f9f57b96c199f3fe5cfb93ac7da095e9 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Thu, 2 Sep 2021 14:59:59 -0700 Subject: mm: compaction: support triggering of proactive compaction by user The proactive compaction[1] gets triggered for every 500msec and run compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9) pages based on the value set to sysctl.compaction_proactiveness. Triggering the compaction for every 500msec in search of COMPACTION_HPAGE_ORDER pages is not needed for all applications, especially on the embedded system usecases which may have few MB's of RAM. Enabling the proactive compaction in its state will endup in running almost always on such systems. Other side, proactive compaction can still be very much useful for getting a set of higher order pages in some controllable manner(controlled by using the sysctl.compaction_proactiveness). So, on systems where enabling the proactive compaction always may proove not required, can trigger the same from user space on write to its sysctl interface. As an example, say app launcher decide to launch the memory heavy application which can be launched fast if it gets more higher order pages thus launcher can prepare the system in advance by triggering the proactive compaction from userspace. This triggering of proactive compaction is done on a write to sysctl.compaction_proactiveness by user. [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a [akpm@linux-foundation.org: tweak vm.rst, per Mike] Link: https://lkml.kernel.org/r/1627653207-12317-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Acked-by: Rafael Aquini Cc: Mike Rapoport Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Dave Hansen Cc: Mel Gorman Cc: Nitin Gupta Cc: Jonathan Corbet Cc: Khalid Aziz Cc: David Rientjes Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 3 ++- include/linux/compaction.h | 2 ++ include/linux/mmzone.h | 1 + kernel/sysctl.c | 2 +- mm/compaction.c | 38 +++++++++++++++++++++++++++++++-- 5 files changed, 42 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 003d5cc3751b..5e795202111f 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -118,7 +118,8 @@ compaction_proactiveness This tunable takes a value in the range [0, 100] with a default value of 20. This tunable determines how aggressively compaction is done in the -background. Setting it to 0 disables proactive compaction. +background. Write of a non zero value to this tunable will immediately +trigger the proactive compaction. Setting it to 0 disables proactive compaction. Note that compaction has a non-trivial system-wide impact as pages belonging to different processes are moved around, which could also lead diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c24098c7acca..34bce35c808d 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order) extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); +extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 59bad25ce78e..1bd5f5955f9a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -846,6 +846,7 @@ typedef struct pglist_data { enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; + bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8c..297f0b3966bd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2871,7 +2871,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_compaction_proactiveness, .maxlen = sizeof(sysctl_compaction_proactiveness), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = compaction_proactiveness_sysctl_handler, .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, diff --git a/mm/compaction.c b/mm/compaction.c index 4ee0d40d93f2..fa9b2b598eab 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2706,6 +2706,30 @@ static void compact_nodes(void) */ unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc, nid; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write && sysctl_compaction_proactiveness) { + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->proactive_compact_trigger) + continue; + + pgdat->proactive_compact_trigger = true; + wake_up_interruptible(&pgdat->kcompactd_wait); + } + } + + return 0; +} + /* * This is the entry point for compacting all nodes via * /proc/sys/vm/compact_memory @@ -2750,7 +2774,8 @@ void compaction_unregister_node(struct node *node) static inline bool kcompactd_work_requested(pg_data_t *pgdat) { - return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); + return pgdat->kcompactd_max_order > 0 || kthread_should_stop() || + pgdat->proactive_compact_trigger; } static bool kcompactd_node_suitable(pg_data_t *pgdat) @@ -2901,9 +2926,16 @@ static int kcompactd(void *p) while (!kthread_should_stop()) { unsigned long pflags; + /* + * Avoid the unnecessary wakeup for proactive compaction + * when it is disabled. + */ + if (!sysctl_compaction_proactiveness) + timeout = MAX_SCHEDULE_TIMEOUT; trace_mm_compaction_kcompactd_sleep(pgdat->node_id); if (wait_event_freezable_timeout(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat), timeout)) { + kcompactd_work_requested(pgdat), timeout) && + !pgdat->proactive_compact_trigger) { psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); @@ -2938,6 +2970,8 @@ static int kcompactd(void *p) timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT; } + if (unlikely(pgdat->proactive_compact_trigger)) + pgdat->proactive_compact_trigger = false; } return 0; -- cgit v1.2.3 From cfcaa66f803233c50e17239469f6c96136a673a1 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Thu, 2 Sep 2021 15:00:13 -0700 Subject: mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY Implement the missing huge page allocation functionality while obeying the preferred node semantics. This is similar to the implementation for general page allocation, as it uses a fallback mechanism to try multiple preferred nodes first, and then all other nodes. To avoid adding too many "#ifdef CONFIG_NUMA" check, add a helper function in mempolicy.h to check whether a mempolicy is MPOL_PREFERRED_MANY. [akpm@linux-foundation.org: fix compiling issue when merging with other hugetlb patch] [Thanks to 0day bot for catching the !CONFIG_NUMA compiling issue] [mhocko@suse.com: suggest to remove the #ifdef CONFIG_NUMA check] [ben.widawsky@intel.com: add helpers to avoid ifdefs] Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/20210809024430.GA46432@shbuild999.sh.intel.com [nathan@kernel.org: initialize page to NULL in alloc_buddy_huge_page_with_mpol()] Link: https://lkml.kernel.org/r/20210810200632.3812797-1-nathan@kernel.org Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/20210809024430.GA46432@shbuild999.sh.intel.com Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Signed-off-by: Nathan Chancellor Co-developed-by: Feng Tang Suggested-by: Michal Hocko Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 12 ++++++++++++ mm/hugetlb.c | 30 +++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4ca025e2a77e..4091692bed8c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -186,6 +186,12 @@ extern void mpol_put_task_policy(struct task_struct *); extern bool numa_demotion_enabled; +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return (pol->mode == MPOL_PREFERRED_MANY); +} + + #else struct mempolicy {}; @@ -296,5 +302,11 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) } #define numa_demotion_enabled false + +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return false; +} + #endif /* CONFIG_NUMA */ #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41a1778d3f67..95dc7b83381f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1145,7 +1145,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1166,7 +1166,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + + if (mpol_is_preferred_many(mpol)) { + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; @@ -2142,16 +2152,26 @@ static struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); - mpol_cond_put(mpol); + if (mpol_is_preferred_many(mpol)) { + gfp_t gfp = gfp_mask | __GFP_NOWARN; + + gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); + mpol_cond_put(mpol); return page; } -- cgit v1.2.3 From a7259df7670240ee03b0cfce8a3e5d3773911e24 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 15:00:26 -0700 Subject: memblock: make memblock_find_in_range method private There are a lot of uses of memblock_find_in_range() along with memblock_reserve() from the times memblock allocation APIs did not exist. memblock_find_in_range() is the very core of memblock allocations, so any future changes to its internal behaviour would mandate updates of all the users outside memblock. Replace the calls to memblock_find_in_range() with an equivalent calls to memblock_phys_alloc() and memblock_phys_alloc_range() and make memblock_find_in_range() private method of memblock. This simplifies the callers, ensures that (unlikely) errors in memblock_reserve() are handled and improves maintainability of memblock_find_in_range(). Link: https://lkml.kernel.org/r/20210816122622.30279-1-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: Catalin Marinas [arm64] Acked-by: Kirill A. Shutemov Acked-by: Rafael J. Wysocki [ACPI] Acked-by: Russell King (Oracle) Acked-by: Nick Kossifidis [riscv] Tested-by: Guenter Roeck Acked-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/setup.c | 20 +++++++----------- arch/arm64/kvm/hyp/reserved_mem.c | 9 +++----- arch/arm64/mm/init.c | 36 ++++++++++---------------------- arch/mips/kernel/setup.c | 14 ++++++------- arch/riscv/mm/init.c | 44 +++++++++++++-------------------------- arch/s390/kernel/setup.c | 9 +++++--- arch/x86/kernel/aperture_64.c | 5 ++--- arch/x86/mm/init.c | 23 +++++++++++++------- arch/x86/mm/numa.c | 5 ++--- arch/x86/mm/numa_emulation.c | 5 ++--- arch/x86/realmode/init.c | 2 +- drivers/acpi/tables.c | 5 ++--- drivers/base/arch_numa.c | 5 +---- drivers/of/of_reserved_mem.c | 12 +++++++---- include/linux/memblock.h | 2 -- mm/memblock.c | 2 +- 16 files changed, 81 insertions(+), 117 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index f97eb2371672..284a80c0b6e1 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -1012,31 +1012,25 @@ static void __init reserve_crashkernel(void) unsigned long long lowmem_max = __pa(high_memory - 1) + 1; if (crash_max > lowmem_max) crash_max = lowmem_max; - crash_base = memblock_find_in_range(CRASH_ALIGN, crash_max, - crash_size, CRASH_ALIGN); + + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + CRASH_ALIGN, crash_max); if (!crash_base) { pr_err("crashkernel reservation failed - No suitable area found.\n"); return; } } else { + unsigned long long crash_max = crash_base + crash_size; unsigned long long start; - start = memblock_find_in_range(crash_base, - crash_base + crash_size, - crash_size, SECTION_SIZE); - if (start != crash_base) { + start = memblock_phys_alloc_range(crash_size, SECTION_SIZE, + crash_base, crash_max); + if (!start) { pr_err("crashkernel reservation failed - memory is in use.\n"); return; } } - ret = memblock_reserve(crash_base, crash_size); - if (ret < 0) { - pr_warn("crashkernel reservation failed - memory is in use (0x%lx)\n", - (unsigned long)crash_base); - return; - } - pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", (unsigned long)(crash_size >> 20), (unsigned long)(crash_base >> 20), diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c index d654921dd09b..578670e3f608 100644 --- a/arch/arm64/kvm/hyp/reserved_mem.c +++ b/arch/arm64/kvm/hyp/reserved_mem.c @@ -92,12 +92,10 @@ void __init kvm_hyp_reserve(void) * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. */ hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; - hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), - ALIGN(hyp_mem_size, PMD_SIZE), - PMD_SIZE); + hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), + PMD_SIZE); if (!hyp_mem_base) - hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), - hyp_mem_size, PAGE_SIZE); + hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); else hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); @@ -105,7 +103,6 @@ void __init kvm_hyp_reserve(void) kvm_err("Failed to reserve hyp memory\n"); return; } - memblock_reserve(hyp_mem_base, hyp_mem_size); kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, hyp_mem_base); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 1fdb7bb7c198..bf5b8a5cd451 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -74,6 +74,7 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init; static void __init reserve_crashkernel(void) { unsigned long long crash_base, crash_size; + unsigned long long crash_max = arm64_dma_phys_limit; int ret; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), @@ -84,33 +85,18 @@ static void __init reserve_crashkernel(void) crash_size = PAGE_ALIGN(crash_size); - if (crash_base == 0) { - /* Current arm64 boot protocol requires 2MB alignment */ - crash_base = memblock_find_in_range(0, arm64_dma_phys_limit, - crash_size, SZ_2M); - if (crash_base == 0) { - pr_warn("cannot allocate crashkernel (size:0x%llx)\n", - crash_size); - return; - } - } else { - /* User specifies base address explicitly. */ - if (!memblock_is_region_memory(crash_base, crash_size)) { - pr_warn("cannot reserve crashkernel: region is not memory\n"); - return; - } + /* User specifies base address explicitly. */ + if (crash_base) + crash_max = crash_base + crash_size; - if (memblock_is_region_reserved(crash_base, crash_size)) { - pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n"); - return; - } - - if (!IS_ALIGNED(crash_base, SZ_2M)) { - pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n"); - return; - } + /* Current arm64 boot protocol requires 2MB alignment */ + crash_base = memblock_phys_alloc_range(crash_size, SZ_2M, + crash_base, crash_max); + if (!crash_base) { + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; } - memblock_reserve(crash_base, crash_size); pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", crash_base, crash_base + crash_size, crash_size >> 20); diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 23a140327a0b..f979adfd4fc2 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -452,8 +452,9 @@ static void __init mips_parse_crashkernel(void) return; if (crash_base <= 0) { - crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_MAX, - crash_size, CRASH_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + CRASH_ALIGN, + CRASH_ADDR_MAX); if (!crash_base) { pr_warn("crashkernel reservation failed - No suitable area found.\n"); return; @@ -461,8 +462,9 @@ static void __init mips_parse_crashkernel(void) } else { unsigned long long start; - start = memblock_find_in_range(crash_base, crash_base + crash_size, - crash_size, 1); + start = memblock_phys_alloc_range(crash_size, 1, + crash_base, + crash_base + crash_size); if (start != crash_base) { pr_warn("Invalid memory region reserved for crash kernel\n"); return; @@ -656,10 +658,6 @@ static void __init arch_mem_init(char **cmdline_p) mips_reserve_vmcore(); mips_parse_crashkernel(); -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) - memblock_reserve(crashk_res.start, resource_size(&crashk_res)); -#endif device_tree_init(); /* diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 7cb4f391d106..e6cac495a9e8 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -819,38 +819,22 @@ static void __init reserve_crashkernel(void) crash_size = PAGE_ALIGN(crash_size); - if (crash_base == 0) { - /* - * Current riscv boot protocol requires 2MB alignment for - * RV64 and 4MB alignment for RV32 (hugepage size) - */ - crash_base = memblock_find_in_range(search_start, search_end, - crash_size, PMD_SIZE); - - if (crash_base == 0) { - pr_warn("crashkernel: couldn't allocate %lldKB\n", - crash_size >> 10); - return; - } - } else { - /* User specifies base address explicitly. */ - if (!memblock_is_region_memory(crash_base, crash_size)) { - pr_warn("crashkernel: requested region is not memory\n"); - return; - } - - if (memblock_is_region_reserved(crash_base, crash_size)) { - pr_warn("crashkernel: requested region is reserved\n"); - return; - } - + if (crash_base) { + search_start = crash_base; + search_end = crash_base + crash_size; + } - if (!IS_ALIGNED(crash_base, PMD_SIZE)) { - pr_warn("crashkernel: requested region is misaligned\n"); - return; - } + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 (hugepage size) + */ + crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, + search_start, search_end); + if (crash_base == 0) { + pr_warn("crashkernel: couldn't allocate %lldKB\n", + crash_size >> 10); + return; } - memblock_reserve(crash_base, crash_size); pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", crash_base, crash_base + crash_size, crash_size >> 20); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index ff0f9e838916..0bab57d6413b 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -626,8 +626,9 @@ static void __init reserve_crashkernel(void) return; } low = crash_base ?: low; - crash_base = memblock_find_in_range(low, high, crash_size, - KEXEC_CRASH_MEM_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + KEXEC_CRASH_MEM_ALIGN, + low, high); } if (!crash_base) { @@ -636,8 +637,10 @@ static void __init reserve_crashkernel(void) return; } - if (register_memory_notifier(&kdump_mem_nb)) + if (register_memory_notifier(&kdump_mem_nb)) { + memblock_free(crash_base, crash_size); return; + } if (!OLDMEM_BASE && MACHINE_IS_VM) diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 294ed4392a0e..10562885f5fc 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -109,14 +109,13 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, - aper_size, aper_size); + addr = memblock_phys_alloc_range(aper_size, aper_size, + GART_MIN_ADDR, GART_MAX_ADDR); if (!addr) { pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); return 0; } - memblock_reserve(addr, aper_size); pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 75ef19aa8903..23a14d82e783 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -127,14 +127,12 @@ __ref void *alloc_low_pages(unsigned int num) unsigned long ret = 0; if (min_pfn_mapped < max_pfn_mapped) { - ret = memblock_find_in_range( + ret = memblock_phys_alloc_range( + PAGE_SIZE * num, PAGE_SIZE, min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE * num , PAGE_SIZE); + max_pfn_mapped << PAGE_SHIFT); } - if (ret) - memblock_reserve(ret, PAGE_SIZE * num); - else if (can_use_brk_pgt) + if (!ret && can_use_brk_pgt) ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); if (!ret) @@ -610,8 +608,17 @@ static void __init memory_map_top_down(unsigned long map_start, unsigned long addr; unsigned long mapped_ram_size = 0; - /* xen has big range in reserved near end of ram, skip it at first.*/ - addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); + /* + * Systems that have many reserved areas near top of the memory, + * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will + * require lots of 4K mappings which may exhaust pgt_buf. + * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure + * there is enough mapped memory that can be allocated from + * memblock. + */ + addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, + map_end); + memblock_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e94da744386f..a1b5c71099e6 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -376,15 +376,14 @@ static int __init numa_alloc_distance(void) cnt++; size = cnt * cnt * sizeof(numa_distance[0]); - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - size, PAGE_SIZE); + phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("Warning: can't allocate distance table!\n"); /* don't retry until explicitly reset */ numa_distance = (void *)1LU; return -ENOMEM; } - memblock_reserve(phys, size); numa_distance = __va(phys); numa_distance_cnt = cnt; diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 87d77cc52f86..737491b13728 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -447,13 +447,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (numa_dist_cnt) { u64 phys; - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - phys_size, PAGE_SIZE); + phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } - memblock_reserve(phys, phys_size); phys_dist = __va(phys); for (i = 0; i < numa_dist_cnt; i++) diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 6534c92d0f83..31b5856010cb 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -28,7 +28,7 @@ void __init reserve_real_mode(void) WARN_ON(slab_is_available()); /* Has to be under 1M so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20); if (!mem) pr_info("No sub-1M memory is available for the trampoline\n"); else diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index a37a1532a575..f9383736fa0f 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -583,8 +583,8 @@ void __init acpi_table_upgrade(void) } acpi_tables_addr = - memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS, - all_tables_size, PAGE_SIZE); + memblock_phys_alloc_range(all_tables_size, PAGE_SIZE, + 0, ACPI_TABLE_UPGRADE_MAX_PHYS); if (!acpi_tables_addr) { WARN_ON(1); return; @@ -599,7 +599,6 @@ void __init acpi_table_upgrade(void) * Both memblock_reserve and e820__range_add (via arch_reserve_mem_area) * works fine. */ - memblock_reserve(acpi_tables_addr, all_tables_size); arch_reserve_mem_area(acpi_tables_addr, all_tables_size); /* diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 4cc4e117727d..46c503486e96 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -279,13 +279,10 @@ static int __init numa_alloc_distance(void) int i, j; size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]); - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn), - size, PAGE_SIZE); + phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn)); if (WARN_ON(!phys)) return -ENOMEM; - memblock_reserve(phys, size); - numa_distance = __va(phys); numa_distance_cnt = nr_node_ids; diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index fd3964d24224..59c1390cdf42 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -33,18 +33,22 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, phys_addr_t *res_base) { phys_addr_t base; + int err = 0; end = !end ? MEMBLOCK_ALLOC_ANYWHERE : end; align = !align ? SMP_CACHE_BYTES : align; - base = memblock_find_in_range(start, end, size, align); + base = memblock_phys_alloc_range(size, align, start, end); if (!base) return -ENOMEM; *res_base = base; - if (nomap) - return memblock_mark_nomap(base, size); + if (nomap) { + err = memblock_mark_nomap(base, size); + if (err) + memblock_free(base, size); + } - return memblock_reserve(base, size); + return err; } /* diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 4a53c3ca86bd..b066024c62e3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -99,8 +99,6 @@ void memblock_discard(void); static inline void memblock_discard(void) {} #endif -phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); diff --git a/mm/memblock.c b/mm/memblock.c index a69449bffc8d..e6b4654f9dfd 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -315,7 +315,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * Return: * Found address on success, 0 on failure. */ -phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, +static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { -- cgit v1.2.3 From dce49103962840dd61423d7627748d6c558d58c5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 15:00:33 -0700 Subject: mm: wire up syscall process_mrelease Split off from prev patch in the series that implements the syscall. Link: https://lkml.kernel.org/r/20210809185259.405936-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Geert Uytterhoeven Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 2 ++ arch/arm/tools/syscall.tbl | 2 ++ arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 2 ++ arch/m68k/kernel/syscalls/syscall.tbl | 2 ++ arch/microblaze/kernel/syscalls/syscall.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_n32.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_n64.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_o32.tbl | 2 ++ arch/parisc/kernel/syscalls/syscall.tbl | 2 ++ arch/powerpc/kernel/syscalls/syscall.tbl | 2 ++ arch/s390/kernel/syscalls/syscall.tbl | 2 ++ arch/sh/kernel/syscalls/syscall.tbl | 2 ++ arch/sparc/kernel/syscalls/syscall.tbl | 2 ++ arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 2 ++ include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 1 + 21 files changed, 38 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index a17687ed4b51..605645eae04c 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -486,3 +486,5 @@ 554 common landlock_create_ruleset sys_landlock_create_ruleset 555 common landlock_add_rule sys_landlock_add_rule 556 common landlock_restrict_self sys_landlock_restrict_self +# 557 reserved for memfd_secret +558 common process_mrelease sys_process_mrelease diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index c5df1179fc5d..2f32eb8beca8 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -460,3 +460,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 727bfc3be99b..3cb206aea3db 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 447 +#define __NR_compat_syscalls 449 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 99ffcafc736c..0f49cdb180dd 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -901,6 +901,8 @@ __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) #define __NR_landlock_restrict_self 446 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 6d07742c57b8..9bf45f2be966 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -367,3 +367,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 541bc1b3a8f9..f1f98ee6c82d 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -446,3 +446,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index a176faca2927..da49ddd4bb54 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -452,3 +452,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index c2d2e19abea8..56c8d3cf42ed 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -385,3 +385,5 @@ 444 n32 landlock_create_ruleset sys_landlock_create_ruleset 445 n32 landlock_add_rule sys_landlock_add_rule 446 n32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n32 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index ac653d08b1ea..1ca7bc337932 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -361,3 +361,5 @@ 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n64 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 253f2cd70b6b..fd3a9df60ec2 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -434,3 +434,5 @@ 444 o32 landlock_create_ruleset sys_landlock_create_ruleset 445 o32 landlock_add_rule sys_landlock_add_rule 446 o32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 o32 process_mrelease sys_process_mrelease diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index e26187b9ab87..040df1b7a589 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -444,3 +444,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index aef2a290e71a..d8ebd7d37c0f 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -526,3 +526,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 64d51ab5a8b4..57233ace30cb 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease sys_process_mrelease diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index e0a70be77d84..2f6e95eb4690 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 603f5a821502..42fc2906215d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -492,3 +492,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ce763a12311c..661a03bcfbd1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -452,3 +452,4 @@ 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self 447 i386 memfd_secret sys_memfd_secret +448 i386 process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f6b57799c1ea..807b6a1de8e8 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -369,6 +369,7 @@ 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self 447 common memfd_secret sys_memfd_secret +448 common process_mrelease sys_process_mrelease # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 235d67d6ceb4..f4384951f393 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -417,3 +417,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..00bc170a50f0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -915,6 +915,7 @@ asmlinkage long sys_mincore(unsigned long start, size_t len, asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); +asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index a9d6fcd95f42..14c8fe863c6d 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -877,9 +877,11 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) #define __NR_memfd_secret 447 __SYSCALL(__NR_memfd_secret, sys_memfd_secret) #endif +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) #undef __NR_syscalls -#define __NR_syscalls 448 +#define __NR_syscalls 449 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 30971b1dd4a9..18a9c2cde767 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -289,6 +289,7 @@ COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); COND_SYSCALL(process_madvise); +COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); -- cgit v1.2.3