From 35822fdae3bf509532b0954088070f17de51ff15 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Apr 2023 17:40:19 +0000 Subject: memcg: remove mem_cgroup_flush_stats_atomic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous patches removed all callers of mem_cgroup_flush_stats_atomic(). Remove the function and simplify the code. Link: https://lkml.kernel.org/r/20230421174020.2994750-5-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 222d7370134c..00a88cf947e1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1038,7 +1038,6 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, } void mem_cgroup_flush_stats(void); -void mem_cgroup_flush_stats_atomic(void); void mem_cgroup_flush_stats_ratelimited(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, @@ -1537,10 +1536,6 @@ static inline void mem_cgroup_flush_stats(void) { } -static inline void mem_cgroup_flush_stats_atomic(void) -{ -} - static inline void mem_cgroup_flush_stats_ratelimited(void) { } -- cgit v1.2.3 From 0a2dc6ac33297f8a1a65f81b633a1ea753f19f69 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Apr 2023 17:40:20 +0000 Subject: cgroup: remove cgroup_rstat_flush_atomic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous patches removed the only caller of cgroup_rstat_flush_atomic(). Remove the function and simplify the code. 
Link: https://lkml.kernel.org/r/20230421174020.2994750-6-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Tejun Heo Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/cgroup.h | 1 - kernel/cgroup/rstat.c | 26 +++++--------------------- 2 files changed, 5 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 885f5395fcd0..567c547cf371 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -692,7 +692,6 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); -void cgroup_rstat_flush_atomic(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 9c4c55228567..2542c21b6b6d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -171,7 +171,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __diag_pop(); /* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) +static void cgroup_rstat_flush_locked(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) { int cpu; @@ -207,9 +207,8 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) } raw_spin_unlock_irqrestore(cpu_lock, flags); - /* if @may_sleep, play nice and yield if necessary */ - if (may_sleep && (need_resched() || - spin_needbreak(&cgroup_rstat_lock))) { + /* play nice and yield if necessary */ + if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { spin_unlock_irq(&cgroup_rstat_lock); if (!cond_resched()) cpu_relax(); @@ -236,25 +235,10 @@ __bpf_kfunc void 
cgroup_rstat_flush(struct cgroup *cgrp) might_sleep(); spin_lock_irq(&cgroup_rstat_lock); - cgroup_rstat_flush_locked(cgrp, true); + cgroup_rstat_flush_locked(cgrp); spin_unlock_irq(&cgroup_rstat_lock); } -/** - * cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush() - * @cgrp: target cgroup - * - * This function can be called from any context. - */ -void cgroup_rstat_flush_atomic(struct cgroup *cgrp) -{ - unsigned long flags; - - spin_lock_irqsave(&cgroup_rstat_lock, flags); - cgroup_rstat_flush_locked(cgrp, false); - spin_unlock_irqrestore(&cgroup_rstat_lock, flags); -} - /** * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold * @cgrp: target cgroup @@ -269,7 +253,7 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp) { might_sleep(); spin_lock_irq(&cgroup_rstat_lock); - cgroup_rstat_flush_locked(cgrp, true); + cgroup_rstat_flush_locked(cgrp); } /** -- cgit v1.2.3 From ffcb5f5262b756a598eefb11e340bbd027cde037 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 2 May 2023 18:36:06 -0700 Subject: workingset: refactor LRU refault to expose refault recency check Patch series "cachestat: a new syscall for page cache state of files", v13. There is currently no good way to query the page cache statistics of large files and directory trees. There is mincore(), but it scales poorly: the kernel writes out a lot of bitmap data that userspace has to aggregate, when the user really does not care about per-page information in that case. The user also needs to mmap and unmap each file as it goes along, which can be quite slow as well. Some use cases where this information could come in handy: * Allowing database to decide whether to perform an index scan or direct table queries based on the in-memory cache state of the index. * Visibility into the writeback algorithm, for performance issues diagnostic. 
* Workload-aware writeback pacing: estimating IO fulfilled by page cache (and IO to be done) within a range of a file, allowing for more frequent syncing when and where there is IO capacity, and batching when there is not. * Computing memory usage of large files/directory trees, analogous to the du tool for disk usage. More information about these use cases could be found in this thread: https://lore.kernel.org/lkml/20230315170934.GA97793@cmpxchg.org/ This series of patches introduces a new system call, cachestat, that summarizes the page cache statistics (number of cached pages, dirty pages, pages marked for writeback, evicted pages etc.) of a file, in a specified range of bytes. It also includes a selftest suite that tests some typical usage. Currently, the syscall is only wired in for x86 architecture. This interface is inspired by past discussion and concerns with fincore, which has a similar design (and as a result, issues) as mincore. Relevant links: https://lkml.indiana.edu/hypermail/linux/kernel/1302.1/04207.html https://lkml.indiana.edu/hypermail/linux/kernel/1302.1/04209.html I have also developed a small tool that computes the memory usage of files and directories, analogous to the du utility. User can choose between mincore or cachestat (with cachestat exporting more information than mincore). 
To compare the performance of these two options, I benchmarked the tool on the root directory of a Meta server machine, each for five runs: Using cachestat: real -- Median: 33.377s, Average: 33.475s, Standard Deviation: 0.3602 user -- Median: 4.08s, Average: 4.1078s, Standard Deviation: 0.0742 sys -- Median: 28.823s, Average: 28.8866s, Standard Deviation: 0.2689 Using mincore: real -- Median: 102.352s, Average: 102.3442s, Standard Deviation: 0.2059 user -- Median: 10.149s, Average: 10.1482s, Standard Deviation: 0.0162 sys -- Median: 91.186s, Average: 91.2084s, Standard Deviation: 0.2046 I also ran both syscalls on a 2TB sparse file: Using cachestat: real 0m0.009s user 0m0.000s sys 0m0.009s Using mincore: real 0m37.510s user 0m2.934s sys 0m34.558s Very large files like this are the pathological case for mincore. In fact, to compute the stats for a single 2TB file, mincore takes as long as cachestat takes to compute the stats for the entire tree! This could easily happen inadvertently when we run it on subdirectories. Mincore is clearly not suitable for a general-purpose command line tool. Regarding security concerns, cachestat() should not pose any additional issues. The caller already has read permission to the file itself (since they need an fd to that file to call cachestat). This means that the caller can access the underlying data in its entirety, which is a much greater source of information (and as a result, a much greater security risk) than the cache status itself. The latest API change (in v13 of the patch series) was suggested by Jens Axboe. It allows for 64-bit length argument, even on 32-bit architecture (which was previously not possible due to the limit on the number of syscall arguments). Furthermore, it eliminates the need for compatibility handling - every user can use the same ABI. 
This patch (of 4): In preparation for computing recently evicted pages in cachestat, refactor workingset_refault and lru_gen_refault to expose a helper function that would test if an evicted page is recently evicted. [penguin-kernel@I-love.SAKURA.ne.jp: add missing rcu_read_unlock() in lru_gen_refault()] Link: https://lkml.kernel.org/r/610781bc-cf11-fc89-a46f-87cb8235d439@I-love.SAKURA.ne.jp Link: https://lkml.kernel.org/r/20230503013608.2431726-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20230503013608.2431726-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Signed-off-by: Tetsuo Handa Acked-by: Johannes Weiner Cc: Brian Foster Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michael Kerrisk Cc: Tetsuo Handa Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 + mm/workingset.c | 150 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 103 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3c69cb653cb9..b2128df5edea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -368,6 +368,7 @@ static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) } /* linux/mm/workingset.c */ +bool workingset_test_recent(void *shadow, bool file, bool *workingset); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); diff --git a/mm/workingset.c b/mm/workingset.c index 817758951886..90ae785d4c9c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -255,6 +255,29 @@ static void *lru_gen_eviction(struct folio *folio) return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); } +/* + * Tests if the shadow entry is for a folio that was recently evicted. + * Fills in @memcgid, @pglist_data, @token, @workingset with the values + * unpacked from shadow. 
+ */ +static bool lru_gen_test_recent(void *shadow, bool file, int *memcgid, + struct pglist_data **pgdat, unsigned long *token, bool *workingset) +{ + struct mem_cgroup *eviction_memcg; + struct lruvec *lruvec; + struct lru_gen_folio *lrugen; + unsigned long min_seq; + + unpack_shadow(shadow, memcgid, pgdat, token, workingset); + eviction_memcg = mem_cgroup_from_id(*memcgid); + + lruvec = mem_cgroup_lruvec(eviction_memcg, *pgdat); + lrugen = &lruvec->lrugen; + + min_seq = READ_ONCE(lrugen->min_seq[file]); + return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); +} + static void lru_gen_refault(struct folio *folio, void *shadow) { int hist, tier, refs; @@ -269,23 +292,22 @@ static void lru_gen_refault(struct folio *folio, void *shadow) int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); - unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); - - if (pgdat != folio_pgdat(folio)) - return; - rcu_read_lock(); + if (!lru_gen_test_recent(shadow, type, &memcg_id, &pgdat, &token, + &workingset)) + goto unlock; + memcg = folio_memcg_rcu(folio); if (memcg_id != mem_cgroup_id(memcg)) goto unlock; + if (pgdat != folio_pgdat(folio)) + goto unlock; + lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; - min_seq = READ_ONCE(lrugen->min_seq[type]); - if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) - goto unlock; hist = lru_hist_from_seq(min_seq); /* see the comment in folio_lru_refs() */ @@ -317,6 +339,12 @@ static void *lru_gen_eviction(struct folio *folio) return NULL; } +static bool lru_gen_test_recent(void *shadow, bool file, int *memcgid, + struct pglist_data **pgdat, unsigned long *token, bool *workingset) +{ + return false; +} + static void lru_gen_refault(struct folio *folio, void *shadow) { } @@ -385,42 +413,34 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) } /** - * workingset_refault - Evaluate the refault of a previously evicted 
folio. - * @folio: The freshly allocated replacement folio. - * @shadow: Shadow entry of the evicted folio. - * - * Calculates and evaluates the refault distance of the previously - * evicted folio in the context of the node and the memcg whose memory - * pressure caused the eviction. + * workingset_test_recent - tests if the shadow entry is for a folio that was + * recently evicted. Also fills in @workingset with the value unpacked from + * shadow. + * @shadow: the shadow entry to be tested. + * @file: whether the corresponding folio is from the file lru. + * @workingset: where the workingset value unpacked from shadow should + * be stored. + * + * Return: true if the shadow is for a recently evicted folio; false otherwise. */ -void workingset_refault(struct folio *folio, void *shadow) +bool workingset_test_recent(void *shadow, bool file, bool *workingset) { - bool file = folio_is_file_lru(folio); struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; unsigned long workingset_size; - struct pglist_data *pgdat; - struct mem_cgroup *memcg; - unsigned long eviction; - struct lruvec *lruvec; unsigned long refault; - bool workingset; int memcgid; - long nr; + struct pglist_data *pgdat; + unsigned long eviction; - if (lru_gen_enabled()) { - lru_gen_refault(folio, shadow); - return; - } + if (lru_gen_enabled()) + return lru_gen_test_recent(shadow, file, &memcgid, &pgdat, &eviction, + workingset); - unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; - /* Flush stats (and potentially sleep) before holding RCU read lock */ - mem_cgroup_flush_stats_ratelimited(); - - rcu_read_lock(); /* * Look up the memcg associated with the stored ID. It might * have been deleted since the folio's eviction. 
@@ -439,7 +459,8 @@ void workingset_refault(struct folio *folio, void *shadow) */ eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && !eviction_memcg) - goto out; + return false; + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -461,20 +482,6 @@ void workingset_refault(struct folio *folio, void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; - /* - * The activation decision for this folio is made at the level - * where the eviction occurred, as that is where the LRU order - * during folio reclaim is being determined. - * - * However, the cgroup that will own the folio is the one that - * is actually experiencing the refault event. - */ - nr = folio_nr_pages(folio); - memcg = folio_memcg(folio); - pgdat = folio_pgdat(folio); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - - mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if @@ -495,7 +502,54 @@ void workingset_refault(struct folio *folio, void *shadow) NR_INACTIVE_ANON); } } - if (refault_distance > workingset_size) + + return refault_distance <= workingset_size; +} + +/** + * workingset_refault - Evaluate the refault of a previously evicted folio. + * @folio: The freshly allocated replacement folio. + * @shadow: Shadow entry of the evicted folio. + * + * Calculates and evaluates the refault distance of the previously + * evicted folio in the context of the node and the memcg whose memory + * pressure caused the eviction. 
+ */ +void workingset_refault(struct folio *folio, void *shadow) +{ + bool file = folio_is_file_lru(folio); + struct pglist_data *pgdat; + struct mem_cgroup *memcg; + struct lruvec *lruvec; + bool workingset; + long nr; + + if (lru_gen_enabled()) { + lru_gen_refault(folio, shadow); + return; + } + + /* Flush stats (and potentially sleep) before holding RCU read lock */ + mem_cgroup_flush_stats_ratelimited(); + + rcu_read_lock(); + + /* + * The activation decision for this folio is made at the level + * where the eviction occurred, as that is where the LRU order + * during folio reclaim is being determined. + * + * However, the cgroup that will own the folio is the one that + * is actually experiencing the refault event. + */ + nr = folio_nr_pages(folio); + memcg = folio_memcg(folio); + pgdat = folio_pgdat(folio); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); + + if (!workingset_test_recent(shadow, file, &workingset)) goto out; folio_set_active(folio); -- cgit v1.2.3 From cf264e1329fb0307e044f7675849f9f38b44c11a Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 2 May 2023 18:36:07 -0700 Subject: cachestat: implement cachestat syscall There is currently no good way to query the page cache state of large file sets and directory trees. There is mincore(), but it scales poorly: the kernel writes out a lot of bitmap data that userspace has to aggregate, when the user really does not care about per-page information in that case. The user also needs to mmap and unmap each file as it goes along, which can be quite slow as well. Some use cases where this information could come in handy: * Allowing database to decide whether to perform an index scan or direct table queries based on the in-memory cache state of the index. * Visibility into the writeback algorithm, for performance issues diagnostic. 
* Workload-aware writeback pacing: estimating IO fulfilled by page cache (and IO to be done) within a range of a file, allowing for more frequent syncing when and where there is IO capacity, and batching when there is not. * Computing memory usage of large files/directory trees, analogous to the du tool for disk usage. More information about these use cases could be found in the following thread: https://lore.kernel.org/lkml/20230315170934.GA97793@cmpxchg.org/ This patch implements a new syscall that queries cache state of a file and summarizes the number of cached pages, number of dirty pages, number of pages marked for writeback, number of (recently) evicted pages, etc. in a given range. Currently, the syscall is only wired in for x86 architecture. NAME cachestat - query the page cache statistics of a file. SYNOPSIS #include struct cachestat_range { __u64 off; __u64 len; }; struct cachestat { __u64 nr_cache; __u64 nr_dirty; __u64 nr_writeback; __u64 nr_evicted; __u64 nr_recently_evicted; }; int cachestat(unsigned int fd, struct cachestat_range *cstat_range, struct cachestat *cstat, unsigned int flags); DESCRIPTION cachestat() queries the number of cached pages, number of dirty pages, number of pages marked for writeback, number of evicted pages, number of recently evicted pages, in the bytes range given by `off` and `len`. An evicted page is a page that is previously in the page cache but has been evicted since. A page is recently evicted if its last eviction was recent enough that its reentry to the cache would indicate that it is actively being used by the system, and that there is memory pressure on the system. These values are returned in a cachestat struct, whose address is given by the `cstat` argument. The `off` and `len` arguments must be non-negative integers. If `len` > 0, the queried range is [`off`, `off` + `len`]. If `len` == 0, we will query in the range from `off` to the end of the file. 
The `flags` argument is unused for now, but is included for future extensibility. User should pass 0 (i.e. no flag specified). Currently, hugetlbfs is not supported. Because the status of a page can change after cachestat() checks it but before it returns to the application, the returned values may contain stale information. RETURN VALUE On success, cachestat returns 0. On error, -1 is returned, and errno is set to indicate the error. ERRORS EFAULT cstat or cstat_range points to an invalid address. EINVAL invalid flags. EBADF invalid file descriptor. EOPNOTSUPP file descriptor is of a hugetlbfs file [nphamcs@gmail.com: replace rounddown logic with the existing helper] Link: https://lkml.kernel.org/r/20230504022044.3675469-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20230503013608.2431726-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Brian Foster Cc: Matthew Wilcox (Oracle) Cc: Michael Kerrisk Signed-off-by: Andrew Morton --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 5 + include/uapi/asm-generic/unistd.h | 5 +- include/uapi/linux/mman.h | 14 +++ init/Kconfig | 10 ++ kernel/sys_ni.c | 1 + mm/filemap.c | 171 +++++++++++++++++++++++++++++++++ 8 files changed, 207 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 320480a8db4f..bc0a3c941b35 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -455,3 +455,4 @@ 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node +451 i386 cachestat sys_cachestat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index c84d12608cd2..227538b0ce80 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -372,6 
+372,7 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat # # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 33a0ee3bcb2e..6648c07c4381 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -72,6 +72,8 @@ struct open_how; struct mount_attr; struct landlock_ruleset_attr; enum landlock_rule_type; +struct cachestat_range; +struct cachestat; #include #include @@ -1058,6 +1060,9 @@ asmlinkage long sys_memfd_secret(unsigned int flags); asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, unsigned long home_node, unsigned long flags); +asmlinkage long sys_cachestat(unsigned int fd, + struct cachestat_range __user *cstat_range, + struct cachestat __user *cstat, unsigned int flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 45fa180cc56a..cd639fae9086 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) +#define __NR_cachestat 451 +__SYSCALL(__NR_cachestat, sys_cachestat) + #undef __NR_syscalls -#define __NR_syscalls 451 +#define __NR_syscalls 452 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index f55bc680b5b0..a246e11988d5 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -4,6 +4,7 @@ #include #include +#include #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 @@ -41,4 +42,17 @@ #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB +struct cachestat_range { + __u64 off; 
+ __u64 len; +}; + +struct cachestat { + __u64 nr_cache; + __u64 nr_dirty; + __u64 nr_writeback; + __u64 nr_evicted; + __u64 nr_recently_evicted; +}; + #endif /* _UAPI_LINUX_MMAN_H */ diff --git a/init/Kconfig b/init/Kconfig index 32c24950c4ce..f7f65af4ee12 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1771,6 +1771,16 @@ config RSEQ If unsure, say Y. +config CACHESTAT_SYSCALL + bool "Enable cachestat() system call" if EXPERT + default y + help + Enable the cachestat system call, which queries the page cache + statistics of a file (number of cached pages, dirty pages, + pages marked for writeback, (recently) evicted pages). + + If unsure say Y here. + config DEBUG_RSEQ default n bool "Enabled debugging of rseq() system call" if EXPERT diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 860b2dcf3ac4..04bfb1e4d377 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -299,6 +299,7 @@ COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); COND_SYSCALL(set_mempolicy_home_node); +COND_SYSCALL(cachestat); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/mm/filemap.c b/mm/filemap.c index b4c9bd368b7e..2d3d70c64dfd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ #include +#include "swap.h" + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -4119,3 +4122,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); + +#ifdef CONFIG_CACHESTAT_SYSCALL +/** + * filemap_cachestat() - compute the page cache statistics of a mapping + * @mapping: The mapping to compute the statistics for. + * @first_index: The starting page cache index. + * @last_index: The final page index (inclusive). + * @cs: the cachestat struct to write the result to. 
+ * + * This will query the page cache statistics of a mapping in the + * page range of [first_index, last_index] (inclusive). The statistics + * queried include: number of dirty pages, number of pages marked for + * writeback, and the number of (recently) evicted pages. + */ +static void filemap_cachestat(struct address_space *mapping, + pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) +{ + XA_STATE(xas, &mapping->i_pages, first_index); + struct folio *folio; + + rcu_read_lock(); + xas_for_each(&xas, folio, last_index) { + unsigned long nr_pages; + pgoff_t folio_first_index, folio_last_index; + + if (xas_retry(&xas, folio)) + continue; + + if (xa_is_value(folio)) { + /* page is evicted */ + void *shadow = (void *)folio; + bool workingset; /* not used */ + int order = xa_get_order(xas.xa, xas.xa_index); + + nr_pages = 1 << order; + folio_first_index = round_down(xas.xa_index, 1 << order); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + cs->nr_evicted += nr_pages; + +#ifdef CONFIG_SWAP /* implies CONFIG_MMU */ + if (shmem_mapping(mapping)) { + /* shmem file - in swap cache */ + swp_entry_t swp = radix_to_swp_entry(folio); + + shadow = get_shadow_from_swap_cache(swp); + } +#endif + if (workingset_test_recent(shadow, true, &workingset)) + cs->nr_recently_evicted += nr_pages; + + goto resched; + } + + nr_pages = folio_nr_pages(folio); + folio_first_index = folio_pgoff(folio); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + /* page is in cache */ 
+ cs->nr_cache += nr_pages; + + if (folio_test_dirty(folio)) + cs->nr_dirty += nr_pages; + + if (folio_test_writeback(folio)) + cs->nr_writeback += nr_pages; + +resched: + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * The cachestat(2) system call. + * + * cachestat() returns the page cache statistics of a file in the + * bytes range specified by `off` and `len`: number of cached pages, + * number of dirty pages, number of pages marked for writeback, + * number of evicted pages, and number of recently evicted pages. + * + * An evicted page is a page that is previously in the page cache + * but has been evicted since. A page is recently evicted if its last + * eviction was recent enough that its reentry to the cache would + * indicate that it is actively being used by the system, and that + * there is memory pressure on the system. + * + * `off` and `len` must be non-negative integers. If `len` > 0, + * the queried range is [`off`, `off` + `len`]. If `len` == 0, + * we will query in the range from `off` to the end of the file. + * + * The `flags` argument is unused for now, but is included for future + * extensibility. User should pass 0 (i.e no flag specified). + * + * Currently, hugetlbfs is not supported. + * + * Because the status of a page can change after cachestat() checks it + * but before it returns to the application, the returned values may + * contain stale information. 
+ * + * return values: + * zero - success + * -EFAULT - cstat or cstat_range points to an illegal address + * -EINVAL - invalid flags + * -EBADF - invalid file descriptor + * -EOPNOTSUPP - file descriptor is of a hugetlbfs file + */ +SYSCALL_DEFINE4(cachestat, unsigned int, fd, + struct cachestat_range __user *, cstat_range, + struct cachestat __user *, cstat, unsigned int, flags) +{ + struct fd f = fdget(fd); + struct address_space *mapping; + struct cachestat_range csr; + struct cachestat cs; + pgoff_t first_index, last_index; + + if (!f.file) + return -EBADF; + + if (copy_from_user(&csr, cstat_range, + sizeof(struct cachestat_range))) { + fdput(f); + return -EFAULT; + } + + /* hugetlbfs is not supported */ + if (is_file_hugepages(f.file)) { + fdput(f); + return -EOPNOTSUPP; + } + + if (flags != 0) { + fdput(f); + return -EINVAL; + } + + first_index = csr.off >> PAGE_SHIFT; + last_index = + csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; + memset(&cs, 0, sizeof(struct cachestat)); + mapping = f.file->f_mapping; + filemap_cachestat(mapping, first_index, last_index, &cs); + fdput(f); + + if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_CACHESTAT_SYSCALL */ -- cgit v1.2.3 From c963901197188189e85b4d768a059fe1bbc2a502 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Wed, 10 May 2023 14:47:16 +0200 Subject: filemap: remove page_endio() page_endio() is not used anymore. Remove it. 
Link: https://lkml.kernel.org/r/20230510124716.73655-1-p.raghav@samsung.com Signed-off-by: Pankaj Raghav Reviewed-by: Christoph Hellwig Acked-by: Matthew Wilcox (Oracle) Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 -- mm/filemap.c | 30 ------------------------------ 2 files changed, 32 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a56308a9d1a4..c1ae5ebc375f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1078,8 +1078,6 @@ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, #else #define filemap_migrate_folio NULL #endif -void page_endio(struct page *page, bool is_write, int err); - void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); diff --git a/mm/filemap.c b/mm/filemap.c index 2d3d70c64dfd..570bc8c3db87 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1628,36 +1628,6 @@ void folio_end_writeback(struct folio *folio) } EXPORT_SYMBOL(folio_end_writeback); -/* - * After completing I/O on a page, call this routine to update the page - * flags appropriately - */ -void page_endio(struct page *page, bool is_write, int err) -{ - struct folio *folio = page_folio(page); - - if (!is_write) { - if (!err) { - folio_mark_uptodate(folio); - } else { - folio_clear_uptodate(folio); - folio_set_error(folio); - } - folio_unlock(folio); - } else { - if (err) { - struct address_space *mapping; - - folio_set_error(folio); - mapping = folio_mapping(folio); - if (mapping) - mapping_set_error(mapping, err); - } - folio_end_writeback(folio); - } -} -EXPORT_SYMBOL_GPL(page_endio); - /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. 
* @folio: The folio to lock -- cgit v1.2.3 From bb6e04a173f06e51819a4bb512e127dfbc50dcfa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 May 2023 16:57:21 +0200 Subject: kasan: use internal prototypes matching gcc-13 builtins gcc-13 warns about function definitions for builtin interfaces that have a different prototype, e.g.: In file included from kasan_test.c:31: kasan.h:574:6: error: conflicting types for built-in function '__asan_register_globals'; expected 'void(void *, long int)' [-Werror=builtin-declaration-mismatch] 574 | void __asan_register_globals(struct kasan_global *globals, size_t size); kasan.h:577:6: error: conflicting types for built-in function '__asan_alloca_poison'; expected 'void(void *, long int)' [-Werror=builtin-declaration-mismatch] 577 | void __asan_alloca_poison(unsigned long addr, size_t size); kasan.h:580:6: error: conflicting types for built-in function '__asan_load1'; expected 'void(void *)' [-Werror=builtin-declaration-mismatch] 580 | void __asan_load1(unsigned long addr); kasan.h:581:6: error: conflicting types for built-in function '__asan_store1'; expected 'void(void *)' [-Werror=builtin-declaration-mismatch] 581 | void __asan_store1(unsigned long addr); kasan.h:643:6: error: conflicting types for built-in function '__hwasan_tag_memory'; expected 'void(void *, unsigned char, long int)' [-Werror=builtin-declaration-mismatch] 643 | void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size); The two problems are: - Addresses are passed as 'unsigned long' in the kernel, but gcc-13 expects a 'void *'. - sizes are meant to use a signed ssize_t rather than size_t. Change all the prototypes to match these. Using 'void *' consistently for addresses gets rid of a couple of type casts, so push that down to the leaf functions where possible.
This now passes all randconfig builds on arm, arm64 and x86, but I have not tested it on the other architectures that support kasan, since they tend to fail randconfig builds in other ways. This might fail if any of the 32-bit architectures expect a 'long' instead of 'int' for the size argument. The __asan_allocas_unpoison() function prototype is somewhat weird, since it uses a pointer for 'stack_top' and an ssize_t for 'stack_bottom'. This looks like it is meant to be 'addr' and 'size' like the others, but the implementation clearly treats them as 'top' and 'bottom'. Link: https://lkml.kernel.org/r/20230509145735.9263-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/fault.c | 2 +- include/linux/kasan.h | 2 +- mm/kasan/common.c | 2 +- mm/kasan/generic.c | 72 ++++++++++----------- mm/kasan/kasan.h | 160 +++++++++++++++++++++++----------------------- mm/kasan/report.c | 17 +++-- mm/kasan/report_generic.c | 12 ++-- mm/kasan/report_hw_tags.c | 2 +- mm/kasan/report_sw_tags.c | 2 +- mm/kasan/shadow.c | 36 +++++------ mm/kasan/sw_tags.c | 20 +++--- 12 files changed, 164 insertions(+), 165 deletions(-) (limited to 'include') diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 4bb1b8f47298..7b889445e5c6 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -1044,7 +1044,7 @@ static int kasan_handler(struct pt_regs *regs, unsigned long esr) bool recover = esr & KASAN_ESR_RECOVER; bool write = esr & KASAN_ESR_WRITE; size_t size = KASAN_ESR_SIZE(esr); - u64 addr = regs->regs[0]; + void *addr = (void *)regs->regs[0]; u64 pc = regs->pc; kasan_report(addr, size, write, pc); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index cb21ccd7940d..d5047eef4295 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -317,7
+317,7 @@ static void report_tag_fault(unsigned long addr, unsigned long esr, * find out access size. */ bool is_write = !!(esr & ESR_ELx_WNR); - kasan_report(addr, 0, is_write, regs->pc); + kasan_report((void *)addr, 0, is_write, regs->pc); } #else /* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index f7ef70661ce2..819b6bc8ac08 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -343,7 +343,7 @@ static inline void *kasan_reset_tag(const void *addr) * @is_write: whether the bad access is a write or a read * @ip: instruction pointer for the accessibility check or the bad access itself */ -bool kasan_report(unsigned long addr, size_t size, +bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip); #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b376a5d055e5..256930da578a 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -445,7 +445,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag bool __kasan_check_byte(const void *address, unsigned long ip) { if (!kasan_byte_accessible(address)) { - kasan_report((unsigned long)address, 1, false, ip); + kasan_report(address, 1, false, ip); return false; } return true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index e5eef670735e..224d161a5a22 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -40,39 +40,39 @@ * depending on memory access size X. 
*/ -static __always_inline bool memory_is_poisoned_1(unsigned long addr) +static __always_inline bool memory_is_poisoned_1(const void *addr) { - s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); + s8 shadow_value = *(s8 *)kasan_mem_to_shadow(addr); if (unlikely(shadow_value)) { - s8 last_accessible_byte = addr & KASAN_GRANULE_MASK; + s8 last_accessible_byte = (unsigned long)addr & KASAN_GRANULE_MASK; return unlikely(last_accessible_byte >= shadow_value); } return false; } -static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, +static __always_inline bool memory_is_poisoned_2_4_8(const void *addr, unsigned long size) { - u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); + u8 *shadow_addr = (u8 *)kasan_mem_to_shadow(addr); /* * Access crosses 8(shadow size)-byte boundary. Such access maps * into 2 shadow bytes, so we need to check them both. */ - if (unlikely(((addr + size - 1) & KASAN_GRANULE_MASK) < size - 1)) + if (unlikely((((unsigned long)addr + size - 1) & KASAN_GRANULE_MASK) < size - 1)) return *shadow_addr || memory_is_poisoned_1(addr + size - 1); return memory_is_poisoned_1(addr + size - 1); } -static __always_inline bool memory_is_poisoned_16(unsigned long addr) +static __always_inline bool memory_is_poisoned_16(const void *addr) { - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow(addr); /* Unaligned 16-bytes access maps into 3 shadow bytes. 
*/ - if (unlikely(!IS_ALIGNED(addr, KASAN_GRANULE_SIZE))) + if (unlikely(!IS_ALIGNED((unsigned long)addr, KASAN_GRANULE_SIZE))) return *shadow_addr || memory_is_poisoned_1(addr + 15); return *shadow_addr; @@ -120,26 +120,25 @@ static __always_inline unsigned long memory_is_nonzero(const void *start, return bytes_is_nonzero(start, (end - start) % 8); } -static __always_inline bool memory_is_poisoned_n(unsigned long addr, - size_t size) +static __always_inline bool memory_is_poisoned_n(const void *addr, size_t size) { unsigned long ret; - ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), - kasan_mem_to_shadow((void *)addr + size - 1) + 1); + ret = memory_is_nonzero(kasan_mem_to_shadow(addr), + kasan_mem_to_shadow(addr + size - 1) + 1); if (unlikely(ret)) { - unsigned long last_byte = addr + size - 1; - s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); + const void *last_byte = addr + size - 1; + s8 *last_shadow = (s8 *)kasan_mem_to_shadow(last_byte); if (unlikely(ret != (unsigned long)last_shadow || - ((long)(last_byte & KASAN_GRANULE_MASK) >= *last_shadow))) + (((long)last_byte & KASAN_GRANULE_MASK) >= *last_shadow))) return true; } return false; } -static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) +static __always_inline bool memory_is_poisoned(const void *addr, size_t size) { if (__builtin_constant_p(size)) { switch (size) { @@ -159,7 +158,7 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) return memory_is_poisoned_n(addr, size); } -static __always_inline bool check_region_inline(unsigned long addr, +static __always_inline bool check_region_inline(const void *addr, size_t size, bool write, unsigned long ret_ip) { @@ -172,7 +171,7 @@ static __always_inline bool check_region_inline(unsigned long addr, if (unlikely(addr + size < addr)) return !kasan_report(addr, size, write, ret_ip); - if (unlikely(!addr_has_metadata((void *)addr))) + if (unlikely(!addr_has_metadata(addr))) return 
!kasan_report(addr, size, write, ret_ip); if (likely(!memory_is_poisoned(addr, size))) @@ -181,7 +180,7 @@ static __always_inline bool check_region_inline(unsigned long addr, return !kasan_report(addr, size, write, ret_ip); } -bool kasan_check_range(unsigned long addr, size_t size, bool write, +bool kasan_check_range(const void *addr, size_t size, bool write, unsigned long ret_ip) { return check_region_inline(addr, size, write, ret_ip); @@ -221,36 +220,37 @@ static void register_global(struct kasan_global *global) KASAN_GLOBAL_REDZONE, false); } -void __asan_register_globals(struct kasan_global *globals, size_t size) +void __asan_register_globals(void *ptr, ssize_t size) { int i; + struct kasan_global *globals = ptr; for (i = 0; i < size; i++) register_global(&globals[i]); } EXPORT_SYMBOL(__asan_register_globals); -void __asan_unregister_globals(struct kasan_global *globals, size_t size) +void __asan_unregister_globals(void *ptr, ssize_t size) { } EXPORT_SYMBOL(__asan_unregister_globals); #define DEFINE_ASAN_LOAD_STORE(size) \ - void __asan_load##size(unsigned long addr) \ + void __asan_load##size(void *addr) \ { \ check_region_inline(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_load##size); \ __alias(__asan_load##size) \ - void __asan_load##size##_noabort(unsigned long); \ + void __asan_load##size##_noabort(void *); \ EXPORT_SYMBOL(__asan_load##size##_noabort); \ - void __asan_store##size(unsigned long addr) \ + void __asan_store##size(void *addr) \ { \ check_region_inline(addr, size, true, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_store##size); \ __alias(__asan_store##size) \ - void __asan_store##size##_noabort(unsigned long); \ + void __asan_store##size##_noabort(void *); \ EXPORT_SYMBOL(__asan_store##size##_noabort) DEFINE_ASAN_LOAD_STORE(1); @@ -259,24 +259,24 @@ DEFINE_ASAN_LOAD_STORE(4); DEFINE_ASAN_LOAD_STORE(8); DEFINE_ASAN_LOAD_STORE(16); -void __asan_loadN(unsigned long addr, size_t size) +void __asan_loadN(void *addr, ssize_t size) { 
kasan_check_range(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_loadN); __alias(__asan_loadN) -void __asan_loadN_noabort(unsigned long, size_t); +void __asan_loadN_noabort(void *, ssize_t); EXPORT_SYMBOL(__asan_loadN_noabort); -void __asan_storeN(unsigned long addr, size_t size) +void __asan_storeN(void *addr, ssize_t size) { kasan_check_range(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__asan_storeN); __alias(__asan_storeN) -void __asan_storeN_noabort(unsigned long, size_t); +void __asan_storeN_noabort(void *, ssize_t); EXPORT_SYMBOL(__asan_storeN_noabort); /* to shut up compiler complaints */ @@ -284,7 +284,7 @@ void __asan_handle_no_return(void) {} EXPORT_SYMBOL(__asan_handle_no_return); /* Emitted by compiler to poison alloca()ed objects. */ -void __asan_alloca_poison(unsigned long addr, size_t size) +void __asan_alloca_poison(void *addr, ssize_t size) { size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE); size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - @@ -295,7 +295,7 @@ void __asan_alloca_poison(unsigned long addr, size_t size) KASAN_ALLOCA_REDZONE_SIZE); const void *right_redzone = (const void *)(addr + rounded_up_size); - WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); + WARN_ON(!IS_ALIGNED((unsigned long)addr, KASAN_ALLOCA_REDZONE_SIZE)); kasan_unpoison((const void *)(addr + rounded_down_size), size - rounded_down_size, false); @@ -307,18 +307,18 @@ void __asan_alloca_poison(unsigned long addr, size_t size) EXPORT_SYMBOL(__asan_alloca_poison); /* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. 
*/ -void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) +void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom) { - if (unlikely(!stack_top || stack_top > stack_bottom)) + if (unlikely(!stack_top || stack_top > (void *)stack_bottom)) return; - kasan_unpoison(stack_top, stack_bottom - stack_top, false); + kasan_unpoison(stack_top, (void *)stack_bottom - stack_top, false); } EXPORT_SYMBOL(__asan_allocas_unpoison); /* Emitted by the compiler to [un]poison local variables. */ #define DEFINE_ASAN_SET_SHADOW(byte) \ - void __asan_set_shadow_##byte(const void *addr, size_t size) \ + void __asan_set_shadow_##byte(const void *addr, ssize_t size) \ { \ __memset((void *)addr, 0x##byte, size); \ } \ diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cd846ca34f44..b799f11e45dc 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -198,13 +198,13 @@ enum kasan_report_type { struct kasan_report_info { /* Filled in by kasan_report_*(). */ enum kasan_report_type type; - void *access_addr; + const void *access_addr; size_t access_size; bool is_write; unsigned long ip; /* Filled in by the common reporting code. 
*/ - void *first_bad_addr; + const void *first_bad_addr; struct kmem_cache *cache; void *object; size_t alloc_size; @@ -311,7 +311,7 @@ static __always_inline bool addr_has_metadata(const void *addr) * @ret_ip: return address * @return: true if access was valid, false if invalid */ -bool kasan_check_range(unsigned long addr, size_t size, bool write, +bool kasan_check_range(const void *addr, size_t size, bool write, unsigned long ret_ip); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ @@ -323,7 +323,7 @@ static __always_inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -void *kasan_find_first_bad_addr(void *addr, size_t size); +const void *kasan_find_first_bad_addr(const void *addr, size_t size); size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache); void kasan_complete_mode_report_info(struct kasan_report_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); @@ -346,7 +346,7 @@ void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object); static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { } #endif -bool kasan_report(unsigned long addr, size_t size, +bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); @@ -571,82 +571,82 @@ void kasan_restore_multi_shot(bool enabled); */ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); -void __asan_register_globals(struct kasan_global *globals, size_t size); -void __asan_unregister_globals(struct kasan_global *globals, size_t size); +void __asan_register_globals(void *globals, ssize_t size); +void __asan_unregister_globals(void *globals, ssize_t size); void __asan_handle_no_return(void); -void __asan_alloca_poison(unsigned long addr, size_t size); -void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); - -void 
__asan_load1(unsigned long addr); -void __asan_store1(unsigned long addr); -void __asan_load2(unsigned long addr); -void __asan_store2(unsigned long addr); -void __asan_load4(unsigned long addr); -void __asan_store4(unsigned long addr); -void __asan_load8(unsigned long addr); -void __asan_store8(unsigned long addr); -void __asan_load16(unsigned long addr); -void __asan_store16(unsigned long addr); -void __asan_loadN(unsigned long addr, size_t size); -void __asan_storeN(unsigned long addr, size_t size); - -void __asan_load1_noabort(unsigned long addr); -void __asan_store1_noabort(unsigned long addr); -void __asan_load2_noabort(unsigned long addr); -void __asan_store2_noabort(unsigned long addr); -void __asan_load4_noabort(unsigned long addr); -void __asan_store4_noabort(unsigned long addr); -void __asan_load8_noabort(unsigned long addr); -void __asan_store8_noabort(unsigned long addr); -void __asan_load16_noabort(unsigned long addr); -void __asan_store16_noabort(unsigned long addr); -void __asan_loadN_noabort(unsigned long addr, size_t size); -void __asan_storeN_noabort(unsigned long addr, size_t size); - -void __asan_report_load1_noabort(unsigned long addr); -void __asan_report_store1_noabort(unsigned long addr); -void __asan_report_load2_noabort(unsigned long addr); -void __asan_report_store2_noabort(unsigned long addr); -void __asan_report_load4_noabort(unsigned long addr); -void __asan_report_store4_noabort(unsigned long addr); -void __asan_report_load8_noabort(unsigned long addr); -void __asan_report_store8_noabort(unsigned long addr); -void __asan_report_load16_noabort(unsigned long addr); -void __asan_report_store16_noabort(unsigned long addr); -void __asan_report_load_n_noabort(unsigned long addr, size_t size); -void __asan_report_store_n_noabort(unsigned long addr, size_t size); - -void __asan_set_shadow_00(const void *addr, size_t size); -void __asan_set_shadow_f1(const void *addr, size_t size); -void __asan_set_shadow_f2(const void *addr, size_t size); 
-void __asan_set_shadow_f3(const void *addr, size_t size); -void __asan_set_shadow_f5(const void *addr, size_t size); -void __asan_set_shadow_f8(const void *addr, size_t size); - -void *__asan_memset(void *addr, int c, size_t len); -void *__asan_memmove(void *dest, const void *src, size_t len); -void *__asan_memcpy(void *dest, const void *src, size_t len); - -void __hwasan_load1_noabort(unsigned long addr); -void __hwasan_store1_noabort(unsigned long addr); -void __hwasan_load2_noabort(unsigned long addr); -void __hwasan_store2_noabort(unsigned long addr); -void __hwasan_load4_noabort(unsigned long addr); -void __hwasan_store4_noabort(unsigned long addr); -void __hwasan_load8_noabort(unsigned long addr); -void __hwasan_store8_noabort(unsigned long addr); -void __hwasan_load16_noabort(unsigned long addr); -void __hwasan_store16_noabort(unsigned long addr); -void __hwasan_loadN_noabort(unsigned long addr, size_t size); -void __hwasan_storeN_noabort(unsigned long addr, size_t size); - -void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size); - -void *__hwasan_memset(void *addr, int c, size_t len); -void *__hwasan_memmove(void *dest, const void *src, size_t len); -void *__hwasan_memcpy(void *dest, const void *src, size_t len); - -void kasan_tag_mismatch(unsigned long addr, unsigned long access_info, +void __asan_alloca_poison(void *, ssize_t size); +void __asan_allocas_unpoison(void *stack_top, ssize_t stack_bottom); + +void __asan_load1(void *); +void __asan_store1(void *); +void __asan_load2(void *); +void __asan_store2(void *); +void __asan_load4(void *); +void __asan_store4(void *); +void __asan_load8(void *); +void __asan_store8(void *); +void __asan_load16(void *); +void __asan_store16(void *); +void __asan_loadN(void *, ssize_t size); +void __asan_storeN(void *, ssize_t size); + +void __asan_load1_noabort(void *); +void __asan_store1_noabort(void *); +void __asan_load2_noabort(void *); +void __asan_store2_noabort(void *); +void 
__asan_load4_noabort(void *); +void __asan_store4_noabort(void *); +void __asan_load8_noabort(void *); +void __asan_store8_noabort(void *); +void __asan_load16_noabort(void *); +void __asan_store16_noabort(void *); +void __asan_loadN_noabort(void *, ssize_t size); +void __asan_storeN_noabort(void *, ssize_t size); + +void __asan_report_load1_noabort(void *); +void __asan_report_store1_noabort(void *); +void __asan_report_load2_noabort(void *); +void __asan_report_store2_noabort(void *); +void __asan_report_load4_noabort(void *); +void __asan_report_store4_noabort(void *); +void __asan_report_load8_noabort(void *); +void __asan_report_store8_noabort(void *); +void __asan_report_load16_noabort(void *); +void __asan_report_store16_noabort(void *); +void __asan_report_load_n_noabort(void *, ssize_t size); +void __asan_report_store_n_noabort(void *, ssize_t size); + +void __asan_set_shadow_00(const void *addr, ssize_t size); +void __asan_set_shadow_f1(const void *addr, ssize_t size); +void __asan_set_shadow_f2(const void *addr, ssize_t size); +void __asan_set_shadow_f3(const void *addr, ssize_t size); +void __asan_set_shadow_f5(const void *addr, ssize_t size); +void __asan_set_shadow_f8(const void *addr, ssize_t size); + +void *__asan_memset(void *addr, int c, ssize_t len); +void *__asan_memmove(void *dest, const void *src, ssize_t len); +void *__asan_memcpy(void *dest, const void *src, ssize_t len); + +void __hwasan_load1_noabort(void *); +void __hwasan_store1_noabort(void *); +void __hwasan_load2_noabort(void *); +void __hwasan_store2_noabort(void *); +void __hwasan_load4_noabort(void *); +void __hwasan_store4_noabort(void *); +void __hwasan_load8_noabort(void *); +void __hwasan_store8_noabort(void *); +void __hwasan_load16_noabort(void *); +void __hwasan_store16_noabort(void *); +void __hwasan_loadN_noabort(void *, ssize_t size); +void __hwasan_storeN_noabort(void *, ssize_t size); + +void __hwasan_tag_memory(void *, u8 tag, ssize_t size); + +void 
*__hwasan_memset(void *addr, int c, ssize_t len); +void *__hwasan_memmove(void *dest, const void *src, ssize_t len); +void *__hwasan_memcpy(void *dest, const void *src, ssize_t len); + +void kasan_tag_mismatch(void *addr, unsigned long access_info, unsigned long ret_ip); #endif /* __MM_KASAN_KASAN_H */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 892a9dc9d4d3..84d9f3b37014 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -211,7 +211,7 @@ static void start_report(unsigned long *flags, bool sync) pr_err("==================================================================\n"); } -static void end_report(unsigned long *flags, void *addr) +static void end_report(unsigned long *flags, const void *addr) { if (addr) trace_error_report_end(ERROR_DETECTOR_KASAN, @@ -450,8 +450,8 @@ static void print_memory_metadata(const void *addr) static void print_report(struct kasan_report_info *info) { - void *addr = kasan_reset_tag(info->access_addr); - u8 tag = get_tag(info->access_addr); + void *addr = kasan_reset_tag((void *)info->access_addr); + u8 tag = get_tag((void *)info->access_addr); print_error_description(info); if (addr_has_metadata(addr)) @@ -468,12 +468,12 @@ static void print_report(struct kasan_report_info *info) static void complete_report_info(struct kasan_report_info *info) { - void *addr = kasan_reset_tag(info->access_addr); + void *addr = kasan_reset_tag((void *)info->access_addr); struct slab *slab; if (info->type == KASAN_REPORT_ACCESS) info->first_bad_addr = kasan_find_first_bad_addr( - info->access_addr, info->access_size); + (void *)info->access_addr, info->access_size); else info->first_bad_addr = addr; @@ -544,11 +544,10 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty * user_access_save/restore(): kasan_report_invalid_free() cannot be called * from a UACCESS region, and kasan_report_async() is not used on x86. 
*/ -bool kasan_report(unsigned long addr, size_t size, bool is_write, +bool kasan_report(const void *addr, size_t size, bool is_write, unsigned long ip) { bool ret = true; - void *ptr = (void *)addr; unsigned long ua_flags = user_access_save(); unsigned long irq_flags; struct kasan_report_info info; @@ -562,7 +561,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, memset(&info, 0, sizeof(info)); info.type = KASAN_REPORT_ACCESS; - info.access_addr = ptr; + info.access_addr = addr; info.access_size = size; info.is_write = is_write; info.ip = ip; @@ -571,7 +570,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, print_report(&info); - end_report(&irq_flags, ptr); + end_report(&irq_flags, (void *)addr); out: user_access_restore(ua_flags); diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 87d39bc0a673..51a1e8a8877f 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -30,9 +30,9 @@ #include "kasan.h" #include "../slab.h" -void *kasan_find_first_bad_addr(void *addr, size_t size) +const void *kasan_find_first_bad_addr(const void *addr, size_t size) { - void *p = addr; + const void *p = addr; if (!addr_has_metadata(p)) return p; @@ -362,14 +362,14 @@ void kasan_print_address_stack_frame(const void *addr) #endif /* CONFIG_KASAN_STACK */ #define DEFINE_ASAN_REPORT_LOAD(size) \ -void __asan_report_load##size##_noabort(unsigned long addr) \ +void __asan_report_load##size##_noabort(void *addr) \ { \ kasan_report(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_report_load##size##_noabort) #define DEFINE_ASAN_REPORT_STORE(size) \ -void __asan_report_store##size##_noabort(unsigned long addr) \ +void __asan_report_store##size##_noabort(void *addr) \ { \ kasan_report(addr, size, true, _RET_IP_); \ } \ @@ -386,13 +386,13 @@ DEFINE_ASAN_REPORT_STORE(4); DEFINE_ASAN_REPORT_STORE(8); DEFINE_ASAN_REPORT_STORE(16); -void __asan_report_load_n_noabort(unsigned long addr, size_t size) +void 
__asan_report_load_n_noabort(void *addr, ssize_t size) { kasan_report(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_report_load_n_noabort); -void __asan_report_store_n_noabort(unsigned long addr, size_t size) +void __asan_report_store_n_noabort(void *addr, ssize_t size) { kasan_report(addr, size, true, _RET_IP_); } diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index 32e80f78de7d..065e1b2fc484 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -15,7 +15,7 @@ #include "kasan.h" -void *kasan_find_first_bad_addr(void *addr, size_t size) +const void *kasan_find_first_bad_addr(const void *addr, size_t size) { /* * Hardware Tag-Based KASAN only calls this function for normal memory diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 8b1f5a73ee6d..689e94f9fe3c 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -30,7 +30,7 @@ #include "kasan.h" #include "../slab.h" -void *kasan_find_first_bad_addr(void *addr, size_t size) +const void *kasan_find_first_bad_addr(const void *addr, size_t size) { u8 tag = get_tag(addr); void *p = kasan_reset_tag(addr); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index c8b86f3273b5..3e62728ae25d 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -28,13 +28,13 @@ bool __kasan_check_read(const volatile void *p, unsigned int size) { - return kasan_check_range((unsigned long)p, size, false, _RET_IP_); + return kasan_check_range((void *)p, size, false, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_read); bool __kasan_check_write(const volatile void *p, unsigned int size) { - return kasan_check_range((unsigned long)p, size, true, _RET_IP_); + return kasan_check_range((void *)p, size, true, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_write); @@ -50,7 +50,7 @@ EXPORT_SYMBOL(__kasan_check_write); #undef memset void *memset(void *addr, int c, size_t len) { - if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_)) + if (!kasan_check_range(addr, 
len, true, _RET_IP_)) return NULL; return __memset(addr, c, len); @@ -60,8 +60,8 @@ void *memset(void *addr, int c, size_t len) #undef memmove void *memmove(void *dest, const void *src, size_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memmove(dest, src, len); @@ -71,17 +71,17 @@ void *memmove(void *dest, const void *src, size_t len) #undef memcpy void *memcpy(void *dest, const void *src, size_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memcpy(dest, src, len); } #endif -void *__asan_memset(void *addr, int c, size_t len) +void *__asan_memset(void *addr, int c, ssize_t len) { - if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_)) + if (!kasan_check_range(addr, len, true, _RET_IP_)) return NULL; return __memset(addr, c, len); @@ -89,10 +89,10 @@ void *__asan_memset(void *addr, int c, size_t len) EXPORT_SYMBOL(__asan_memset); #ifdef __HAVE_ARCH_MEMMOVE -void *__asan_memmove(void *dest, const void *src, size_t len) +void *__asan_memmove(void *dest, const void *src, ssize_t len) { - if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memmove(dest, src, len); @@ -100,10 +100,10 @@ void *__asan_memmove(void *dest, const void *src, size_t len) EXPORT_SYMBOL(__asan_memmove); #endif -void *__asan_memcpy(void *dest, const void *src, size_t len) +void *__asan_memcpy(void *dest, const void *src, ssize_t len) { - if 
(!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || - !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range(src, len, false, _RET_IP_) || + !kasan_check_range(dest, len, true, _RET_IP_)) return NULL; return __memcpy(dest, src, len); @@ -111,13 +111,13 @@ void *__asan_memcpy(void *dest, const void *src, size_t len) EXPORT_SYMBOL(__asan_memcpy); #ifdef CONFIG_KASAN_SW_TAGS -void *__hwasan_memset(void *addr, int c, size_t len) __alias(__asan_memset); +void *__hwasan_memset(void *addr, int c, ssize_t len) __alias(__asan_memset); EXPORT_SYMBOL(__hwasan_memset); #ifdef __HAVE_ARCH_MEMMOVE -void *__hwasan_memmove(void *dest, const void *src, size_t len) __alias(__asan_memmove); +void *__hwasan_memmove(void *dest, const void *src, ssize_t len) __alias(__asan_memmove); EXPORT_SYMBOL(__hwasan_memmove); #endif -void *__hwasan_memcpy(void *dest, const void *src, size_t len) __alias(__asan_memcpy); +void *__hwasan_memcpy(void *dest, const void *src, ssize_t len) __alias(__asan_memcpy); EXPORT_SYMBOL(__hwasan_memcpy); #endif diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 30da65fa02a1..220b5d4c6876 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -70,8 +70,8 @@ u8 kasan_random_tag(void) return (u8)(state % (KASAN_TAG_MAX + 1)); } -bool kasan_check_range(unsigned long addr, size_t size, bool write, - unsigned long ret_ip) +bool kasan_check_range(const void *addr, size_t size, bool write, + unsigned long ret_ip) { u8 tag; u8 *shadow_first, *shadow_last, *shadow; @@ -133,12 +133,12 @@ bool kasan_byte_accessible(const void *addr) } #define DEFINE_HWASAN_LOAD_STORE(size) \ - void __hwasan_load##size##_noabort(unsigned long addr) \ + void __hwasan_load##size##_noabort(void *addr) \ { \ - kasan_check_range(addr, size, false, _RET_IP_); \ + kasan_check_range(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__hwasan_load##size##_noabort); \ - void __hwasan_store##size##_noabort(unsigned long addr) \ + void 
__hwasan_store##size##_noabort(void *addr) \ { \ kasan_check_range(addr, size, true, _RET_IP_); \ } \ @@ -150,25 +150,25 @@ DEFINE_HWASAN_LOAD_STORE(4); DEFINE_HWASAN_LOAD_STORE(8); DEFINE_HWASAN_LOAD_STORE(16); -void __hwasan_loadN_noabort(unsigned long addr, unsigned long size) +void __hwasan_loadN_noabort(void *addr, ssize_t size) { kasan_check_range(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__hwasan_loadN_noabort); -void __hwasan_storeN_noabort(unsigned long addr, unsigned long size) +void __hwasan_storeN_noabort(void *addr, ssize_t size) { kasan_check_range(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__hwasan_storeN_noabort); -void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) +void __hwasan_tag_memory(void *addr, u8 tag, ssize_t size) { - kasan_poison((void *)addr, size, tag, false); + kasan_poison(addr, size, tag, false); } EXPORT_SYMBOL(__hwasan_tag_memory); -void kasan_tag_mismatch(unsigned long addr, unsigned long access_info, +void kasan_tag_mismatch(void *addr, unsigned long access_info, unsigned long ret_ip) { kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10, -- cgit v1.2.3 From 870388db25324fec267862baddc28aaaf0baca73 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 May 2023 19:41:27 +0800 Subject: mm: memory_failure: move memory_failure_attr_group under MEMORY_FAILURE The memory_failure_attr_group is only called if MEMORY_FAILURE enabled, move it under this configuration. 
Link: https://lkml.kernel.org/r/20230508114128.37081-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 27ce77080c79..f64bfbd53c65 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3586,6 +3586,10 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE +/* + * Sysfs entries for memory failure handling statistics. + */ +extern const struct attribute_group memory_failure_attr_group; extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -3678,11 +3682,6 @@ enum mf_action_page_type { MF_MSG_UNKNOWN, }; -/* - * Sysfs entries for memory failure handling statistics. - */ -extern const struct attribute_group memory_failure_attr_group; - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr_hint, -- cgit v1.2.3 From 904d58578fce531be07619a2bc2cdc16c9fd49b6 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:11 +0800 Subject: mm: page_alloc: move set_zone_contiguous() into mm_init.c set_zone_contiguous() is only used in mm init/hotplug, and clear_zone_contiguous() only used in hotplug, move them from page_alloc.c to the more appropriate file. Link: https://lkml.kernel.org/r/20230516063821.121844-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. 
Wysocki Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 3 --- mm/internal.h | 7 +++++++ mm/mm_init.c | 22 ++++++++++++++++++++++ mm/page_alloc.c | 27 --------------------------- 4 files changed, 29 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 9fcbf5706595..04bc286eed42 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -326,9 +326,6 @@ static inline int remove_memory(u64 start, u64 size) static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #ifdef CONFIG_MEMORY_HOTPLUG extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); diff --git a/mm/internal.h b/mm/internal.h index 68410c6d97ac..c99da2cfac71 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -371,6 +371,13 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } +void set_zone_contiguous(struct zone *zone); + +static inline void clear_zone_contiguous(struct zone *zone) +{ + zone->contiguous = false; +} + extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); diff --git a/mm/mm_init.c b/mm/mm_init.c index 15201887f8e0..0fd4ddfdfb2e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2330,6 +2330,28 @@ void __init init_cma_reserved_pageblock(struct page *page) } #endif +void set_zone_contiguous(struct zone *zone) +{ + unsigned long block_start_pfn = zone->zone_start_pfn; + unsigned long block_end_pfn; + + block_end_pfn = pageblock_end_pfn(block_start_pfn); + for (; block_start_pfn < zone_end_pfn(zone); + block_start_pfn = block_end_pfn, + block_end_pfn += 
pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); + + if (!__pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, zone)) + return; + cond_resched(); + } + + /* We confirm that there is no hole */ + zone->contiguous = true; +} + void __init page_alloc_init_late(void) { struct zone *zone; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f094ba7c8fb..7bb0d6abfe3d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1532,33 +1532,6 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, return start_page; } -void set_zone_contiguous(struct zone *zone) -{ - unsigned long block_start_pfn = zone->zone_start_pfn; - unsigned long block_end_pfn; - - block_end_pfn = pageblock_end_pfn(block_start_pfn); - for (; block_start_pfn < zone_end_pfn(zone); - block_start_pfn = block_end_pfn, - block_end_pfn += pageblock_nr_pages) { - - block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); - - if (!__pageblock_pfn_to_page(block_start_pfn, - block_end_pfn, zone)) - return; - cond_resched(); - } - - /* We confirm that there is no hole */ - zone->contiguous = true; -} - -void clear_zone_contiguous(struct zone *zone) -{ - zone->contiguous = false; -} - /* * The order of subdivision here is critical for the IO subsystem. * Please do not alter this order without good reasons and regression -- cgit v1.2.3 From 0866e82e40fba45dae07e6e8385929b574201752 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:15 +0800 Subject: mm: page_alloc: split out FAIL_PAGE_ALLOC ... to a single file to reduce a bit of page_alloc.c. Link: https://lkml.kernel.org/r/20230516063821.121844-8-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. 
Wysocki Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 9 ++++++ mm/Makefile | 1 + mm/fail_page_alloc.c | 66 +++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 74 -------------------------------------------- 4 files changed, 76 insertions(+), 74 deletions(-) create mode 100644 mm/fail_page_alloc.c (limited to 'include') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 481abf530b3c..6d5edef09d45 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -93,6 +93,15 @@ struct kmem_cache; bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); +#ifdef CONFIG_FAIL_PAGE_ALLOC +bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); +#else +static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + return false; +} +#endif /* CONFIG_FAIL_PAGE_ALLOC */ + int should_failslab(struct kmem_cache *s, gfp_t gfpflags); #ifdef CONFIG_FAILSLAB extern bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags); diff --git a/mm/Makefile b/mm/Makefile index 5262ce5baa28..0eec4bc72d3f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_KMSAN) += kmsan/ obj-$(CONFIG_FAILSLAB) += failslab.o +obj-$(CONFIG_FAIL_PAGE_ALLOC) += fail_page_alloc.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_NUMA) += memory-tiers.o diff --git a/mm/fail_page_alloc.c b/mm/fail_page_alloc.c new file mode 100644 index 000000000000..b1b09cce9394 --- /dev/null +++ b/mm/fail_page_alloc.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fault-inject.h> +#include <linux/mm.h> + +static struct { + struct fault_attr attr; + + bool ignore_gfp_highmem; + bool ignore_gfp_reclaim; + u32 min_order; +} fail_page_alloc = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_reclaim = true, + .ignore_gfp_highmem = true, + .min_order = 1, +}; + +static int __init setup_fail_page_alloc(char *str) +{ +
return setup_fault_attr(&fail_page_alloc.attr, str); +} +__setup("fail_page_alloc=", setup_fail_page_alloc); + +bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + int flags = 0; + + if (order < fail_page_alloc.min_order) + return false; + if (gfp_mask & __GFP_NOFAIL) + return false; + if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) + return false; + if (fail_page_alloc.ignore_gfp_reclaim && + (gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* See comment in __should_failslab() */ + if (gfp_mask & __GFP_NOWARN) + flags |= FAULT_NOWARN; + + return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_page_alloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_page_alloc", NULL, + &fail_page_alloc.attr); + + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_reclaim); + debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); + + return 0; +} + +late_initcall(fail_page_alloc_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 37ac82083229..2b8e4a086c3d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2994,80 +2994,6 @@ out: return page; } -#ifdef CONFIG_FAIL_PAGE_ALLOC - -static struct { - struct fault_attr attr; - - bool ignore_gfp_highmem; - bool ignore_gfp_reclaim; - u32 min_order; -} fail_page_alloc = { - .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_reclaim = true, - .ignore_gfp_highmem = true, - .min_order = 1, -}; - -static int __init setup_fail_page_alloc(char *str) -{ - return setup_fault_attr(&fail_page_alloc.attr, str); -} -__setup("fail_page_alloc=", setup_fail_page_alloc); - -static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) -{ - int flags = 0; - 
- if (order < fail_page_alloc.min_order) - return false; - if (gfp_mask & __GFP_NOFAIL) - return false; - if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) - return false; - if (fail_page_alloc.ignore_gfp_reclaim && - (gfp_mask & __GFP_DIRECT_RECLAIM)) - return false; - - /* See comment in __should_failslab() */ - if (gfp_mask & __GFP_NOWARN) - flags |= FAULT_NOWARN; - - return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); -} - -#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - -static int __init fail_page_alloc_debugfs(void) -{ - umode_t mode = S_IFREG | 0600; - struct dentry *dir; - - dir = fault_create_debugfs_attr("fail_page_alloc", NULL, - &fail_page_alloc.attr); - - debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_reclaim); - debugfs_create_bool("ignore-gfp-highmem", mode, dir, - &fail_page_alloc.ignore_gfp_highmem); - debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); - - return 0; -} - -late_initcall(fail_page_alloc_debugfs); - -#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ - -#else /* CONFIG_FAIL_PAGE_ALLOC */ - -static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) -{ - return false; -} - -#endif /* CONFIG_FAIL_PAGE_ALLOC */ - noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { return __should_fail_alloc_page(gfp_mask, order); -- cgit v1.2.3 From 884c175f12ce1fabff18ac113349628149fc6cf2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:16 +0800 Subject: mm: page_alloc: split out DEBUG_PAGEALLOC Move DEBUG_PAGEALLOC related functions into a single file to reduce a bit of page_alloc.c. Link: https://lkml.kernel.org/r/20230516063821.121844-9-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. 
Wysocki Signed-off-by: Andrew Morton --- include/linux/mm.h | 76 +++++++++++++++++++++++++++++++++------------------ mm/Makefile | 1 + mm/debug_page_alloc.c | 59 +++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 69 ---------------------------------------------- 4 files changed, 109 insertions(+), 96 deletions(-) create mode 100644 mm/debug_page_alloc.c (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f64bfbd53c65..2382eaf6fd81 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3471,9 +3471,58 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } + +extern unsigned int _debug_guardpage_minorder; +DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); + +static inline unsigned int debug_guardpage_minorder(void) +{ + return _debug_guardpage_minorder; +} + +static inline bool debug_guardpage_enabled(void) +{ + return static_branch_unlikely(&_debug_guardpage_enabled); +} + +static inline bool page_is_guard(struct page *page) +{ + if (!debug_guardpage_enabled()) + return false; + + return PageGuard(page); +} + +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, + int migratetype); +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) +{ + if (!debug_guardpage_enabled()) + return false; + return __set_page_guard(zone, page, order, migratetype); +} + +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, + int migratetype); +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) +{ + if (!debug_guardpage_enabled()) + return; + __clear_page_guard(zone, page, order, migratetype); +} + #else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct 
page *page, int numpages) {} +static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool debug_guardpage_enabled(void) { return false; } +static inline bool page_is_guard(struct page *page) { return false; } +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { return false; } +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA @@ -3711,33 +3760,6 @@ static inline bool vma_is_special_huge(const struct vm_area_struct *vma) #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -#ifdef CONFIG_DEBUG_PAGEALLOC -extern unsigned int _debug_guardpage_minorder; -DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); - -static inline unsigned int debug_guardpage_minorder(void) -{ - return _debug_guardpage_minorder; -} - -static inline bool debug_guardpage_enabled(void) -{ - return static_branch_unlikely(&_debug_guardpage_enabled); -} - -static inline bool page_is_guard(struct page *page) -{ - if (!debug_guardpage_enabled()) - return false; - - return PageGuard(page); -} -#else -static inline unsigned int debug_guardpage_minorder(void) { return 0; } -static inline bool debug_guardpage_enabled(void) { return false; } -static inline bool page_is_guard(struct page *page) { return false; } -#endif /* CONFIG_DEBUG_PAGEALLOC */ - #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); #else diff --git a/mm/Makefile b/mm/Makefile index 0eec4bc72d3f..678530a07326 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -124,6 +124,7 @@ obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o +obj-$(CONFIG_DEBUG_PAGEALLOC) += debug_page_alloc.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_DAMON) += damon/ obj-$(CONFIG_HARDENED_USERCOPY) += 
usercopy.o diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c new file mode 100644 index 000000000000..f9d145730fd1 --- /dev/null +++ b/mm/debug_page_alloc.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/page-isolation.h> + +unsigned int _debug_guardpage_minorder; + +bool _debug_pagealloc_enabled_early __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled_early); +DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +EXPORT_SYMBOL(_debug_pagealloc_enabled); + +DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); + +static int __init early_debug_pagealloc(char *buf) +{ + return kstrtobool(buf, &_debug_pagealloc_enabled_early); +} +early_param("debug_pagealloc", early_debug_pagealloc); + +static int __init debug_guardpage_minorder_setup(char *buf) +{ + unsigned long res; + + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + pr_err("Bad debug_guardpage_minorder value\n"); + return 0; + } + _debug_guardpage_minorder = res; + pr_info("Setting debug_guardpage_minorder to %lu\n", res); + return 0; +} +early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); + +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, + int migratetype) +{ + if (order >= debug_guardpage_minorder()) + return false; + + __SetPageGuard(page); + INIT_LIST_HEAD(&page->buddy_list); + set_page_private(page, order); + /* Guard pages are not available for any usage */ + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; +} + +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, + int migratetype) +{ + __ClearPageGuard(page); + + set_page_private(page, 0); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, (1 << order), migratetype); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b8e4a086c3d..40fa763c5074 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@
-664,75 +664,6 @@ void destroy_large_folio(struct folio *folio) compound_page_dtors[dtor](&folio->page); } -#ifdef CONFIG_DEBUG_PAGEALLOC -unsigned int _debug_guardpage_minorder; - -bool _debug_pagealloc_enabled_early __read_mostly - = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); -EXPORT_SYMBOL(_debug_pagealloc_enabled_early); -DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); -EXPORT_SYMBOL(_debug_pagealloc_enabled); - -DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); - -static int __init early_debug_pagealloc(char *buf) -{ - return kstrtobool(buf, &_debug_pagealloc_enabled_early); -} -early_param("debug_pagealloc", early_debug_pagealloc); - -static int __init debug_guardpage_minorder_setup(char *buf) -{ - unsigned long res; - - if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { - pr_err("Bad debug_guardpage_minorder value\n"); - return 0; - } - _debug_guardpage_minorder = res; - pr_info("Setting debug_guardpage_minorder to %lu\n", res); - return 0; -} -early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); - -static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) -{ - if (!debug_guardpage_enabled()) - return false; - - if (order >= debug_guardpage_minorder()) - return false; - - __SetPageGuard(page); - INIT_LIST_HEAD(&page->buddy_list); - set_page_private(page, order); - /* Guard pages are not available for any usage */ - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, -(1 << order), migratetype); - - return true; -} - -static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) -{ - if (!debug_guardpage_enabled()) - return; - - __ClearPageGuard(page); - - set_page_private(page, 0); - if (!is_migrate_isolate(migratetype)) - __mod_zone_freepage_state(zone, (1 << order), migratetype); -} -#else -static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int 
migratetype) { return false; } -static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} -#endif - static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); -- cgit v1.2.3 From 31a1b9d7fe768db521b12287ec6426983e9787e3 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:17 +0800 Subject: mm: page_alloc: move mark_free_page() into snapshot.c The mark_free_page() is only used in kernel/power/snapshot.c, move it out to reduce a bit of page_alloc.c Link: https://lkml.kernel.org/r/20230516063821.121844-10-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- include/linux/suspend.h | 3 --- kernel/power/snapshot.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 55 ------------------------------------------------- 3 files changed, 52 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d0d4598a7b3f..3950a7bf33ae 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -364,9 +364,6 @@ struct pbe { struct pbe *next; }; -/* mm/page_alloc.c */ -extern void mark_free_pages(struct zone *zone); - /** * struct platform_hibernation_ops - hibernation platform support * diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cd8b7b35f1e8..45ef0bf81c85 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1228,6 +1228,58 @@ unsigned int snapshot_additional_pages(struct zone *zone) return 2 * rtree; } +/* + * Touch the watchdog for every WD_PAGE_COUNT pages. 
+ */ +#define WD_PAGE_COUNT (128*1024) + +static void mark_free_pages(struct zone *zone) +{ + unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; + unsigned long flags; + unsigned int order, t; + struct page *page; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(&zone->lock, flags); + + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + + if (page_zone(page) != zone) + continue; + + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } + + for_each_migratetype_order(order, t) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], buddy_list) { + unsigned long i; + + pfn = page_to_pfn(page); + for (i = 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } + } + spin_unlock_irqrestore(&zone->lock, flags); +} + #ifdef CONFIG_HIGHMEM /** * count_free_highmem_pages - Compute the total number of free highmem pages. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40fa763c5074..8d306203e555 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2365,61 +2365,6 @@ void drain_all_pages(struct zone *zone) __drain_all_pages(zone, false); } -#ifdef CONFIG_HIBERNATION - -/* - * Touch the watchdog for every WD_PAGE_COUNT pages. 
- */ -#define WD_PAGE_COUNT (128*1024) - -void mark_free_pages(struct zone *zone) -{ - unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; - unsigned long flags; - unsigned int order, t; - struct page *page; - - if (zone_is_empty(zone)) - return; - - spin_lock_irqsave(&zone->lock, flags); - - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } - - if (page_zone(page) != zone) - continue; - - if (!swsusp_page_is_forbidden(page)) - swsusp_unset_page_free(page); - } - - for_each_migratetype_order(order, t) { - list_for_each_entry(page, - &zone->free_area[order].free_list[t], buddy_list) { - unsigned long i; - - pfn = page_to_pfn(page); - for (i = 0; i < (1UL << order); i++) { - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } - swsusp_set_page_free(pfn_to_page(pfn + i)); - } - } - } - spin_unlock_irqrestore(&zone->lock, flags); -} -#endif /* CONFIG_PM */ - static bool free_unref_page_prepare(struct page *page, unsigned long pfn, unsigned int order) { -- cgit v1.2.3 From 07f44ac3c90c50a201307d3fe4dda120ee8394f5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:18 +0800 Subject: mm: page_alloc: move pm_* function into power pm_restrict_gfp_mask()/pm_restore_gfp_mask() only used in power, let's move them out of page_alloc.c. Adding a general gfp_has_io_fs() function which return true if gfp with both __GFP_IO and __GFP_FS flags, then use it inside of pm_suspended_storage(), also the pm_suspended_storage() is moved into suspend.h. Link: https://lkml.kernel.org/r/20230516063821.121844-11-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. 
Wysocki Signed-off-by: Andrew Morton --- include/linux/gfp.h | 15 ++++----------- include/linux/suspend.h | 6 ++++++ kernel/power/main.c | 27 +++++++++++++++++++++++++++ kernel/power/power.h | 5 +++++ mm/page_alloc.c | 38 -------------------------------------- mm/swapfile.c | 1 + 6 files changed, 43 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ed8cb537c6a7..665f06675c83 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -338,19 +338,12 @@ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); -extern void pm_restrict_gfp_mask(void); -extern void pm_restore_gfp_mask(void); - -extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); - -#ifdef CONFIG_PM_SLEEP -extern bool pm_suspended_storage(void); -#else -static inline bool pm_suspended_storage(void) +static inline bool gfp_has_io_fs(gfp_t gfp) { - return false; + return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } -#endif /* CONFIG_PM_SLEEP */ + +extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. 
*/ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 3950a7bf33ae..76923051c03d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -502,6 +502,11 @@ extern void pm_report_max_hw_sleep(u64 t); extern bool events_check_enabled; extern suspend_state_t pm_suspend_target_state; +static inline bool pm_suspended_storage(void) +{ + return !gfp_has_io_fs(gfp_allowed_mask); +} + extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); extern void pm_system_cancel_wakeup(void); @@ -535,6 +540,7 @@ static inline void ksys_sync_helper(void) {} #define pm_notifier(fn, pri) do { (void)(fn); } while (0) +static inline bool pm_suspended_storage(void) { return false; } static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} static inline void pm_wakeup_clear(bool reset) {} diff --git a/kernel/power/main.c b/kernel/power/main.c index 3113ec2f1db4..34fc8359145b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,6 +21,33 @@ #include "power.h" #ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). 
+ */ +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } +} + +void pm_restrict_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); +} unsigned int lock_system_sleep(void) { diff --git a/kernel/power/power.h b/kernel/power/power.h index b83c8d5e188d..ac14d1b463d1 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -216,6 +216,11 @@ static inline void suspend_test_finish(const char *label) {} /* kernel/power/main.c */ extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); +void pm_restrict_gfp_mask(void); +void pm_restore_gfp_mask(void); +#else +static inline void pm_restrict_gfp_mask(void) {} +static inline void pm_restore_gfp_mask(void) {} #endif #ifdef CONFIG_HIGHMEM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d306203e555..005aa0202ae0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -227,44 +227,6 @@ static inline void set_pcppage_migratetype(struct page *page, int migratetype) page->index = migratetype; } -#ifdef CONFIG_PM_SLEEP -/* - * The following functions are used by the suspend/hibernate code to temporarily - * change gfp_allowed_mask in order to avoid using I/O during memory allocations - * while devices are suspended. To avoid races with the suspend/hibernate code, - * they should always be called with system_transition_mutex held - * (gfp_allowed_mask also should only be modified with system_transition_mutex - * held, unless the suspend/hibernate code is guaranteed not to run in parallel - * with that modification). 
- */ - -static gfp_t saved_gfp_mask; - -void pm_restore_gfp_mask(void) -{ - WARN_ON(!mutex_is_locked(&system_transition_mutex)); - if (saved_gfp_mask) { - gfp_allowed_mask = saved_gfp_mask; - saved_gfp_mask = 0; - } -} - -void pm_restrict_gfp_mask(void) -{ - WARN_ON(!mutex_is_locked(&system_transition_mutex)); - WARN_ON(saved_gfp_mask); - saved_gfp_mask = gfp_allowed_mask; - gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); -} - -bool pm_suspended_storage(void) -{ - if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) - return false; - return true; -} -#endif /* CONFIG_PM_SLEEP */ - #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE unsigned int pageblock_order __read_mostly; #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 274bbf797480..c74259001d5e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From e95d372c4cd46b6ec4eeacc07adcb7260ab4cfa0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:20 +0800 Subject: mm: page_alloc: move sysctls into it own fils This moves all page alloc related sysctls to its own file, as part of the kernel/sysctl.c spring cleaning, also move some functions declarations from mm.h into internal.h. Link: https://lkml.kernel.org/r/20230516063821.121844-13-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. 
Wysocki Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ------ include/linux/mmzone.h | 21 ---------- kernel/sysctl.c | 67 -------------------------------- mm/internal.h | 11 ++++++ mm/mm_init.c | 2 + mm/page_alloc.c | 103 ++++++++++++++++++++++++++++++++++++++++++------- 6 files changed, 102 insertions(+), 113 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2382eaf6fd81..6d7e03d83da7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2994,12 +2994,6 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_range(unsigned long, int, unsigned long, - unsigned long, unsigned long, enum meminit_context, - struct vmem_altmap *, int migratetype); -extern void setup_per_zone_wmarks(void); -extern void calculate_min_free_kbytes(void); -extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); @@ -3020,11 +3014,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); -/* page_alloc.c */ -extern int min_free_kbytes; -extern int watermark_boost_factor; -extern int watermark_scale_factor; - /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a4889c9d4055..3a68326c9989 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1512,27 +1512,6 @@ static inline bool has_managed_dma(void) } #endif -/* These two functions are used to setup the per zone pages min values */ -struct ctl_table; - -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, - loff_t *); -int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, - size_t *, loff_t *); -extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; -int 
lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, - size_t *, loff_t *); -int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -int numa_zonelist_order_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); -extern int percpu_pagelist_high_fraction; -extern char numa_zonelist_order[]; -#define NUMA_ZONELIST_ORDER_LEN 16 #ifndef CONFIG_NUMA diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bfe53e835524..a57de67f032f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2119,13 +2119,6 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_ONE, }, #endif - { - .procname = "lowmem_reserve_ratio", - .data = &sysctl_lowmem_reserve_ratio, - .maxlen = sizeof(sysctl_lowmem_reserve_ratio), - .mode = 0644, - .proc_handler = lowmem_reserve_ratio_sysctl_handler, - }, { .procname = "drop_caches", .data = &sysctl_drop_caches, @@ -2135,39 +2128,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_FOUR, }, - { - .procname = "min_free_kbytes", - .data = &min_free_kbytes, - .maxlen = sizeof(min_free_kbytes), - .mode = 0644, - .proc_handler = min_free_kbytes_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_boost_factor", - .data = &watermark_boost_factor, - .maxlen = sizeof(watermark_boost_factor), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "watermark_scale_factor", - .data = &watermark_scale_factor, - .maxlen = sizeof(watermark_scale_factor), - .mode = 0644, - .proc_handler = watermark_scale_factor_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_THREE_THOUSAND, - }, - { - .procname = "percpu_pagelist_high_fraction", - .data = &percpu_pagelist_high_fraction, - .maxlen = 
sizeof(percpu_pagelist_high_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, @@ -2223,24 +2183,6 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { - .procname = "min_unmapped_ratio", - .data = &sysctl_min_unmapped_ratio, - .maxlen = sizeof(sysctl_min_unmapped_ratio), - .mode = 0644, - .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "min_slab_ratio", - .data = &sysctl_min_slab_ratio, - .maxlen = sizeof(sysctl_min_slab_ratio), - .mode = 0644, - .proc_handler = sysctl_min_slab_ratio_sysctl_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, #endif #ifdef CONFIG_SMP { @@ -2267,15 +2209,6 @@ static struct ctl_table vm_table[] = { .proc_handler = mmap_min_addr_handler, }, #endif -#ifdef CONFIG_NUMA - { - .procname = "numa_zonelist_order", - .data = &numa_zonelist_order, - .maxlen = NUMA_ZONELIST_ORDER_LEN, - .mode = 0644, - .proc_handler = numa_zonelist_order_handler, - }, -#endif #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { diff --git a/mm/internal.h b/mm/internal.h index c99da2cfac71..66d7ddf7e211 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -213,6 +213,13 @@ static inline bool is_check_pages_enabled(void) return static_branch_unlikely(&check_pages_enabled); } +extern int min_free_kbytes; + +void setup_per_zone_wmarks(void); +void calculate_min_free_kbytes(void); +int __meminit init_per_zone_wmark_min(void); +void page_alloc_sysctl_init(void); + /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* @@ -423,6 +430,10 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t 
min_addr, int nid, bool exact_nid); +void memmap_init_range(unsigned long, int, unsigned long, unsigned long, + unsigned long, enum meminit_context, struct vmem_altmap *, int); + + int split_free_page(struct page *free_page, unsigned int order, unsigned long split_pfn_offset); diff --git a/mm/mm_init.c b/mm/mm_init.c index 0fd4ddfdfb2e..10bf560302c4 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2392,6 +2392,8 @@ void __init page_alloc_init_late(void) /* Initialize page ext after all struct pages are initialized. */ if (deferred_struct_pages) page_ext_init(); + + page_alloc_sysctl_init(); } #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 005aa0202ae0..d19a05264125 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -206,7 +206,6 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); -int percpu_pagelist_high_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; /* @@ -302,8 +301,8 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -int watermark_boost_factor __read_mostly = 15000; -int watermark_scale_factor = 10; +static int watermark_boost_factor __read_mostly = 15000; +static int watermark_scale_factor = 10; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -4880,12 +4879,12 @@ static int __parse_numa_zonelist_order(char *s) return 0; } -char numa_zonelist_order[] = "Node"; - +static char numa_zonelist_order[] = "Node"; +#define NUMA_ZONELIST_ORDER_LEN 16 /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(struct ctl_table *table, int write, +static int numa_zonelist_order_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { if (write) @@ -4893,7 +4892,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, return proc_dostring(table, write, buffer, length, 
ppos); } - static int node_load[MAX_NUMNODES]; /** @@ -5296,6 +5294,7 @@ static int zone_batchsize(struct zone *zone) #endif } +static int percpu_pagelist_high_fraction; static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU @@ -5825,7 +5824,7 @@ postcore_initcall(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, +static int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5841,7 +5840,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } -int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, +static int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5871,7 +5870,7 @@ static void setup_min_unmapped_ratio(void) } -int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, +static int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5898,7 +5897,7 @@ static void setup_min_slab_ratio(void) sysctl_min_slab_ratio) / 100; } -int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, +static int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5922,8 +5921,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. 
*/ -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) +static int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos) { int i; @@ -5943,7 +5942,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. */ -int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, +static int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5976,6 +5975,82 @@ out: return ret; } +static struct ctl_table page_alloc_sysctl_table[] = { + { + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_boost_factor", + .data = &watermark_boost_factor, + .maxlen = sizeof(watermark_boost_factor), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_THREE_THOUSAND, + }, + { + .procname = "percpu_pagelist_high_fraction", + .data = &percpu_pagelist_high_fraction, + .maxlen = sizeof(percpu_pagelist_high_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, + }, +#ifdef CONFIG_NUMA + { + 
.procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = numa_zonelist_order_handler, + }, + { + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +#endif + {} +}; + +void __init page_alloc_sysctl_init(void) +{ + register_sysctl_init("vm", page_alloc_sysctl_table); +} + #ifdef CONFIG_CONTIG_ALLOC /* Usage: See admin-guide/dynamic-debug-howto.rst */ static void alloc_contig_dump_pages(struct list_head *page_list) -- cgit v1.2.3 From f6797adff7f09b4d7f7607c99116409b5ddb54d9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 16 May 2023 15:52:05 -0700 Subject: mm/hugetlb: remove hugetlb_page_subpool() All users of hugetlb_page_subpool() have been converted to use the folio equivalent. This function can be safely removed. 
Link: https://lkml.kernel.org/r/20230516225205.1429196-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6d041aa9f0fe..f1543a0568ff 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -757,14 +757,6 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio return folio->_hugetlb_subpool; } -/* - * hugetlb page subpool pointer located in hpage[2].hugetlb_subpool - */ -static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) -{ - return hugetlb_folio_subpool(page_folio(hpage)); -} - static inline void hugetlb_set_folio_subpool(struct folio *folio, struct hugepage_subpool *subpool) { @@ -1031,11 +1023,6 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio return NULL; } -static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) -{ - return NULL; -} - static inline int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { -- cgit v1.2.3 From 54d020692b342f7bd02d7f5795fb5c401caecfcc Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:33 +0100 Subject: mm/gup: remove unused vmas parameter from get_user_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "remove the vmas parameter from GUP APIs", v6. (pin_/get)_user_pages[_remote]() each provide an optional output parameter for an array of VMA objects associated with each page in the input range. These provide the means for VMAs to be returned, as long as mm->mmap_lock is never released during the GUP operation (i.e. the internal flag FOLL_UNLOCKABLE is not specified). 
In addition, these VMAs can only be accessed with the mmap_lock held and become invalidated the moment it is released. The vast majority of invocations do not use this functionality and of those that do, all but one case retrieve a single VMA to perform checks upon. It is not egregious in the single VMA cases to simply replace the operation with a vma_lookup(). In these cases we duplicate the (fast) lookup on a slow path already under the mmap_lock, abstracted to a new get_user_page_vma_remote() inline helper function which also performs error checking and reference count maintenance. The special case is io_uring, where io_pin_pages() specifically needs to assert that the VMAs underlying the range do not result in broken long-term GUP file-backed mappings. As GUP now internally asserts that FOLL_LONGTERM mappings are not file-backed in a broken fashion (i.e. requiring dirty tracking) - as implemented in "mm/gup: disallow FOLL_LONGTERM GUP-nonfast writing to file-backed mappings" - this logic is no longer required and so we can simply remove it altogether from io_uring. Eliminating the vmas parameter eliminates an entire class of dangling pointer errors that might have occurred should the lock have been incorrectly released. In addition, the API is simplified and now clearly expresses what it is intended for - applying the specified GUP flags and (if pinning) returning pinned pages. This change additionally opens the door to further potential improvements in GUP and the possible marrying of disparate code paths. I have run this series against gup_test with no issues. Thanks to Matthew Wilcox for suggesting this refactoring! This patch (of 6): No invocation of get_user_pages() uses the vmas parameter, so remove it. The GUP API is confusing and caveated. Recent changes have done much to improve that, however there is more we can do. 
Exporting vmas is a prime target as the caller has to be extremely careful to preclude their use after the mmap_lock has expired or otherwise be left with dangling pointers. Removing the vmas parameter focuses the GUP functions upon their primary purpose - pinning (and outputting) pages as well as performing the actions implied by the input flags. This is part of a patch series aiming to remove the vmas parameter altogether. Link: https://lkml.kernel.org/r/cover.1684350871.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/589e0c64794668ffc799651e8d85e703262b1e9d.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Suggested-by: Matthew Wilcox (Oracle) Acked-by: Greg Kroah-Hartman Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Acked-by: Christian König (for radeon parts) Acked-by: Jarkko Sakkinen Reviewed-by: Christoph Hellwig Acked-by: Sean Christopherson (KVM) Cc: Catalin Marinas Cc: Dennis Dalessandro Cc: Janosch Frank Cc: Jens Axboe Cc: Sakari Ailus Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/sgx/ioctl.c | 2 +- drivers/gpu/drm/radeon/radeon_ttm.c | 2 +- drivers/misc/sgi-gru/grufault.c | 2 +- include/linux/mm.h | 3 +-- mm/gup.c | 9 +++------ mm/gup_test.c | 5 ++--- virt/kvm/kvm_main.c | 2 +- 7 files changed, 10 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 21ca0a831b70..5d390df21440 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -214,7 +214,7 @@ static int __sgx_encl_add_page(struct sgx_encl *encl, if (!(vma->vm_flags & VM_MAYEXEC)) return -EACCES; - ret = get_user_pages(src, 1, 0, &src_page, NULL); + ret = get_user_pages(src, 1, 0, &src_page); if (ret < 1) return -EFAULT; diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 2220cdf6a3f6..3a9db030f98f 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -359,7 +359,7 
@@ static int radeon_ttm_tt_pin_userptr(struct ttm_device *bdev, struct ttm_tt *ttm struct page **pages = ttm->pages + pinned; r = get_user_pages(userptr, num_pages, write ? FOLL_WRITE : 0, - pages, NULL); + pages); if (r < 0) goto release_pages; diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index b836936e9747..378cf02a2aa1 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -185,7 +185,7 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma, #else *pageshift = PAGE_SHIFT; #endif - if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL) <= 0) + if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page) <= 0) return -EFAULT; *paddr = page_to_phys(page); put_page(page); diff --git a/include/linux/mm.h b/include/linux/mm.h index 6d7e03d83da7..6336253c18e2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2369,8 +2369,7 @@ long pin_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); diff --git a/mm/gup.c b/mm/gup.c index e19b06a66229..21daeee5f163 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2251,8 +2251,6 @@ long get_user_pages_remote(struct mm_struct *mm, * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. 
* * This is the same as get_user_pages_remote(), just with a less-flexible * calling convention where we assume that the mm being operated on belongs to @@ -2260,16 +2258,15 @@ long get_user_pages_remote(struct mm_struct *mm, * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + unsigned int gup_flags, struct page **pages) { int locked = 1; - if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH)) + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - vmas, &locked, gup_flags); + NULL, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/gup_test.c b/mm/gup_test.c index 8ae7307a1bb6..9ba8ea23f84e 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -139,8 +139,7 @@ static int __gup_test_ioctl(unsigned int cmd, pages + i); break; case GUP_BASIC_TEST: - nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, - NULL); + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i); break; case PIN_FAST_BENCHMARK: nr = pin_user_pages_fast(addr, nr, gup->gup_flags, @@ -161,7 +160,7 @@ static int __gup_test_ioctl(unsigned int cmd, pages + i, NULL); else nr = get_user_pages(addr, nr, gup->gup_flags, - pages + i, NULL); + pages + i); break; default: ret = -EINVAL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 479802a892d4..51e4882d0873 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2477,7 +2477,7 @@ static inline int check_user_page_hwpoison(unsigned long addr) { int rc, flags = FOLL_HWPOISON | FOLL_WRITE; - rc = get_user_pages(addr, 1, flags, NULL, NULL); + rc = get_user_pages(addr, 1, flags, NULL); return rc == -EHWPOISON; } -- cgit v1.2.3 From 0b295316b3a9b7858eafbebdc31b4827a6edde03 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:36 +0100 Subject: mm/gup: 
remove unused vmas parameter from pin_user_pages_remote() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No invocation of pin_user_pages_remote() uses the vmas parameter, so remove it. This forms part of a larger patch set eliminating the use of the vmas parameters altogether. Link: https://lkml.kernel.org/r/28f000beb81e45bf538a2aaa77c90f5482b67a32.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Christoph Hellwig Cc: Catalin Marinas Cc: Christian König Cc: Dennis Dalessandro Cc: Greg Kroah-Hartman Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Cc: Sakari Ailus Cc: Sean Christopherson Signed-off-by: Andrew Morton --- drivers/iommu/iommufd/pages.c | 4 ++-- drivers/vfio/vfio_iommu_type1.c | 2 +- include/linux/mm.h | 2 +- kernel/trace/trace_events_user.c | 2 +- mm/gup.c | 8 +++----- mm/process_vm_access.c | 2 +- 6 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 3c47846cc5ef..412ca96be128 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -786,7 +786,7 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, user->locked = 1; } rc = pin_user_pages_remote(pages->source_mm, uptr, npages, - user->gup_flags, user->upages, NULL, + user->gup_flags, user->upages, &user->locked); } if (rc <= 0) { @@ -1799,7 +1799,7 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, rc = pin_user_pages_remote( pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE), 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? 
FOLL_WRITE : 0, &page, - NULL, NULL); + NULL); mmap_read_unlock(pages->source_mm); if (rc != 1) { if (WARN_ON(rc >= 0)) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0d2f805468e1..306e6f1d1c70 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -562,7 +562,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, mmap_read_lock(mm); ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, - pages, NULL, NULL); + pages, NULL); if (ret > 0) { int i; diff --git a/include/linux/mm.h b/include/linux/mm.h index 6336253c18e2..cf17ffdf4fbf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2367,7 +2367,7 @@ long get_user_pages_remote(struct mm_struct *mm, long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked); + int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index b1ecd7677642..bdc2666e8d39 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -406,7 +406,7 @@ static int user_event_enabler_write(struct user_event_mm *mm, return -EBUSY; ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, - &page, NULL, NULL); + &page, NULL); if (unlikely(ret <= 0)) { if (!fixup_fault) diff --git a/mm/gup.c b/mm/gup.c index 21daeee5f163..edf0fe2695b0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3100,8 +3100,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. - * @vmas: array of pointers to vmas corresponding to each page. 
- * Or NULL if the caller does not require them. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. @@ -3116,14 +3114,14 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { int local_locked = 1; - if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + if (!is_valid_gup_args(pages, NULL, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; - return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, + return __gup_longterm_locked(mm, start, nr_pages, pages, NULL, locked ? locked : &local_locked, gup_flags); } diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 78dfaf9e8990..0523edab03a6 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -104,7 +104,7 @@ static int process_vm_rw_single_vec(unsigned long addr, mmap_read_lock(mm); pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, flags, process_pages, - NULL, &locked); + &locked); if (locked) mmap_read_unlock(mm); if (pinned_pages <= 0) -- cgit v1.2.3 From ca5e863233e8f6acd1792fd85d6bc2729a1b2c10 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:39 +0100 Subject: mm/gup: remove vmas parameter from get_user_pages_remote() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only instances of get_user_pages_remote() invocations which used the vmas parameter were for a single page which can instead simply look up the VMA directly. In particular:- - __update_ref_ctr() looked up the VMA but did nothing with it so we simply remove it. - __access_remote_vm() was already using vma_lookup() when the original lookup failed so by doing the lookup directly this also de-duplicates the code. 
We are able to perform these VMA operations as we already hold the mmap_lock in order to be able to call get_user_pages_remote(). As part of this work we add get_user_page_vma_remote() which abstracts the VMA lookup, error handling and decrementing the page reference count should the VMA lookup fail. This forms part of a broader set of patches intended to eliminate the vmas parameter altogether. [akpm@linux-foundation.org: avoid passing NULL to PTR_ERR] Link: https://lkml.kernel.org/r/d20128c849ecdbf4dd01cc828fcec32127ed939a.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Catalin Marinas (for arm64) Acked-by: David Hildenbrand Reviewed-by: Janosch Frank (for s390) Reviewed-by: Christoph Hellwig Cc: Christian König Cc: Dennis Dalessandro Cc: Greg Kroah-Hartman Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Cc: Sakari Ailus Cc: Sean Christopherson Signed-off-by: Andrew Morton --- arch/arm64/kernel/mte.c | 17 +++++++++-------- arch/s390/kvm/interrupt.c | 2 +- fs/exec.c | 2 +- include/linux/mm.h | 34 +++++++++++++++++++++++++++++++--- kernel/events/uprobes.c | 13 +++++-------- mm/gup.c | 12 ++++-------- mm/memory.c | 20 ++++++++++---------- mm/rmap.c | 2 +- security/tomoyo/domain.c | 2 +- virt/kvm/async_pf.c | 3 +-- 10 files changed, 64 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 7e89968bd282..4c5ef9b20065 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -416,10 +416,9 @@ long get_mte_ctrl(struct task_struct *task) static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, struct iovec *kiov, unsigned int gup_flags) { - struct vm_area_struct *vma; void __user *buf = kiov->iov_base; size_t len = kiov->iov_len; - int ret; + int err = 0; int write = gup_flags & FOLL_WRITE; if (!access_ok(buf, len)) @@ -429,14 +428,16 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, 
return -EIO; while (len) { + struct vm_area_struct *vma; unsigned long tags, offset; void *maddr; - struct page *page = NULL; + struct page *page = get_user_page_vma_remote(mm, addr, + gup_flags, &vma); - ret = get_user_pages_remote(mm, addr, 1, gup_flags, &page, - &vma, NULL); - if (ret <= 0) + if (IS_ERR_OR_NULL(page)) { + err = page == NULL ? -EIO : PTR_ERR(page); break; + } /* * Only copy tags if the page has been mapped as PROT_MTE @@ -446,7 +447,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, * was never mapped with PROT_MTE. */ if (!(vma->vm_flags & VM_MTE)) { - ret = -EOPNOTSUPP; + err = -EOPNOTSUPP; put_page(page); break; } @@ -479,7 +480,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, kiov->iov_len = buf - kiov->iov_base; if (!kiov->iov_len) { /* check for error accessing the tracee's address space */ - if (ret <= 0) + if (err) return -EIO; else return -EFAULT; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index da6dac36e959..9bd0a873f3b1 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2777,7 +2777,7 @@ static struct page *get_map_page(struct kvm *kvm, u64 uaddr) mmap_read_lock(kvm->mm); get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, - &page, NULL, NULL); + &page, NULL); mmap_read_unlock(kvm->mm); return page; } diff --git a/fs/exec.c b/fs/exec.c index a466e797c8e2..25c65b64544b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -220,7 +220,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, */ mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, - &page, NULL, NULL); + &page, NULL); mmap_read_unlock(bprm->mm); if (ret <= 0) return NULL; diff --git a/include/linux/mm.h b/include/linux/mm.h index cf17ffdf4fbf..fcbfb961b49f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2353,6 +2353,9 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, 
unmap_mapping_range(mapping, holebegin, holelen, 0); } +static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, + unsigned long addr); + extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, @@ -2361,13 +2364,38 @@ extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked); + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); + +static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, + unsigned long addr, + int gup_flags, + struct vm_area_struct **vmap) +{ + struct page *page; + struct vm_area_struct *vma; + int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); + + if (got < 0) + return ERR_PTR(got); + if (got == 0) + return NULL; + + vma = vma_lookup(mm, addr); + if (WARN_ON_ONCE(!vma)) { + put_page(page); + return ERR_PTR(-EINVAL); + } + + *vmap = vma; + return page; +} + long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 59887c69d54c..607d742caa61 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -365,7 +365,6 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; - struct vm_area_struct *vma; int ret; short *ptr; @@ -373,7 +372,7 @@ __update_ref_ctr(struct mm_struct *mm, 
unsigned long vaddr, short d) return -EINVAL; ret = get_user_pages_remote(mm, vaddr, 1, - FOLL_WRITE, &page, &vma, NULL); + FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, @@ -474,10 +473,9 @@ retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, - &old_page, &vma, NULL); - if (ret <= 0) - return ret; + old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR_OR_NULL(old_page)) + return old_page ? PTR_ERR(old_page) : 0; ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) @@ -2027,8 +2025,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, - NULL, NULL); + result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL); if (result < 0) return result; diff --git a/mm/gup.c b/mm/gup.c index edf0fe2695b0..764bf0c20827 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2165,8 +2165,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. @@ -2181,8 +2179,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, * * The caller is responsible for releasing returned @pages, via put_page(). * - * @vmas are valid only as long as mmap_lock is held. 
- * * Must be called with mmap_lock held for read or write. * * get_user_pages_remote walks a process's page tables and takes a reference @@ -2219,15 +2215,15 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { int local_locked = 1; - if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, + if (!is_valid_gup_args(pages, NULL, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + return __get_user_pages_locked(mm, start, nr_pages, pages, NULL, locked ? locked : &local_locked, gup_flags); } @@ -2237,7 +2233,7 @@ EXPORT_SYMBOL(get_user_pages_remote); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { return 0; } diff --git a/mm/memory.c b/mm/memory.c index f69fbc251198..4dd09f930c61 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5587,7 +5587,6 @@ EXPORT_SYMBOL_GPL(generic_access_phys); int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - struct vm_area_struct *vma; void *old_buf = buf; int write = gup_flags & FOLL_WRITE; @@ -5596,29 +5595,30 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, /* ignore errors, just check how much was successfully transferred */ while (len) { - int bytes, ret, offset; + int bytes, offset; void *maddr; - struct page *page = NULL; + struct vm_area_struct *vma = NULL; + struct page *page = get_user_page_vma_remote(mm, addr, + gup_flags, &vma); - ret = get_user_pages_remote(mm, addr, 1, - gup_flags, &page, &vma, NULL); - if (ret <= 0) { + if (IS_ERR_OR_NULL(page)) { #ifndef CONFIG_HAVE_IOREMAP_PROT 
break; #else + int res = 0; + /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ - vma = vma_lookup(mm, addr); if (!vma) break; if (vma->vm_ops && vma->vm_ops->access) - ret = vma->vm_ops->access(vma, addr, buf, + res = vma->vm_ops->access(vma, addr, buf, len, write); - if (ret <= 0) + if (res <= 0) break; - bytes = ret; + bytes = res; #endif } else { bytes = len; diff --git a/mm/rmap.c b/mm/rmap.c index 19392e090bec..cd918cb9a431 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2328,7 +2328,7 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, npages = get_user_pages_remote(mm, start, npages, FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, - pages, NULL, NULL); + pages, NULL); if (npages < 0) return npages; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 31af29f669d2..ac20c0bdff9d 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -916,7 +916,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, */ mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, - FOLL_FORCE, &page, NULL, NULL); + FOLL_FORCE, &page, NULL); mmap_read_unlock(bprm->mm); if (ret <= 0) return false; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 9bfe1d6f6529..e033c79d528e 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -61,8 +61,7 @@ static void async_pf_execute(struct work_struct *work) * access remotely. 
*/ mmap_read_lock(mm); - get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, NULL, - &locked); + get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked); if (locked) mmap_read_unlock(mm); -- cgit v1.2.3 From 4c630f307455c06f99bdeca7f7a1ab5318604fe0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:45 +0100 Subject: mm/gup: remove vmas parameter from pin_user_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are now in a position where no caller of pin_user_pages() requires the vmas parameter at all, so eliminate this parameter from the function and all callers. This clears the way to removing the vmas parameter from GUP altogether. Link: https://lkml.kernel.org/r/195a99ae949c9f5cb589d2222b736ced96ec199a.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Dennis Dalessandro [qib] Reviewed-by: Christoph Hellwig Acked-by: Sakari Ailus [drivers/media] Cc: Catalin Marinas Cc: Christian König Cc: Greg Kroah-Hartman Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Cc: Sean Christopherson Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/iommu_api.c | 2 +- drivers/infiniband/hw/qib/qib_user_pages.c | 2 +- drivers/infiniband/hw/usnic/usnic_uiom.c | 2 +- drivers/infiniband/sw/siw/siw_mem.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 2 +- drivers/vdpa/vdpa_user/vduse_dev.c | 2 +- drivers/vhost/vdpa.c | 2 +- include/linux/mm.h | 3 +-- io_uring/rsrc.c | 2 +- mm/gup.c | 9 +++------ mm/gup_test.c | 9 ++++----- net/xdp/xdp_umem.c | 2 +- 12 files changed, 17 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 81d7185e2ae8..d19fb1f3007d 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -105,7 +105,7 @@ static long mm_iommu_do_alloc(struct 
mm_struct *mm, unsigned long ua, ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n, FOLL_WRITE | FOLL_LONGTERM, - mem->hpages + entry, NULL); + mem->hpages + entry); if (ret == n) { pinned += n; continue; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index f693bc753b6b..1bb7507325bc 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -111,7 +111,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, ret = pin_user_pages(start_page + got * PAGE_SIZE, num_pages - got, FOLL_LONGTERM | FOLL_WRITE, - p + got, NULL); + p + got); if (ret < 0) { mmap_read_unlock(current->mm); goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 2a5cac2658ec..84e0f41e7dfa 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -140,7 +140,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = pin_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + gup_flags, page_list); if (ret < 0) goto out; diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index f51ab2ccf151..e6e25f15567d 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -422,7 +422,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) umem->page_chunk[i].plist = plist; while (nents) { rv = pin_user_pages(first_page_va, nents, foll_flags, - plist, NULL); + plist); if (rv < 0) goto out_sem_up; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 53001532e8e3..405b89ea1054 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -180,7 +180,7 @@ static int videobuf_dma_init_user_locked(struct 
videobuf_dmabuf *dma, data, size, dma->nr_pages); err = pin_user_pages(data & PAGE_MASK, dma->nr_pages, gup_flags, - dma->pages, NULL); + dma->pages); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index de97e38c3b82..4d4405f058e8 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1052,7 +1052,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, goto out; pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE, - page_list, NULL); + page_list); if (pinned != npages) { ret = pinned < 0 ? pinned : -ENOMEM; goto out; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 8c1aefc865f0..61223fcbe82b 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -983,7 +983,7 @@ static int vhost_vdpa_pa_map(struct vhost_vdpa *v, while (npages) { sz2pin = min_t(unsigned long, npages, list_size); pinned = pin_user_pages(cur_base, sz2pin, - gup_flags, page_list, NULL); + gup_flags, page_list); if (sz2pin != pinned) { if (pinned < 0) { ret = pinned; diff --git a/include/linux/mm.h b/include/linux/mm.h index fcbfb961b49f..280429ffa91d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2399,8 +2399,7 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + unsigned int gup_flags, struct page **pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index b6451f8bc5d5..b56bda46a9eb 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ 
-1044,7 +1044,7 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) ret = 0; mmap_read_lock(current->mm); pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages, NULL); + pages); if (pret == nr_pages) *npages = nr_pages; else diff --git a/mm/gup.c b/mm/gup.c index 764bf0c20827..18e3bc2ee3f1 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3131,8 +3131,6 @@ EXPORT_SYMBOL(pin_user_pages_remote); * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. @@ -3141,15 +3139,14 @@ EXPORT_SYMBOL(pin_user_pages_remote); * see Documentation/core-api/pin_user_pages.rst for details. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + unsigned int gup_flags, struct page **pages) { int locked = 1; - if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN)) + if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, &locked, gup_flags); + pages, NULL, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); diff --git a/mm/gup_test.c b/mm/gup_test.c index 9ba8ea23f84e..1668ce0e0783 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -146,18 +146,17 @@ static int __gup_test_ioctl(unsigned int cmd, pages + i); break; case PIN_BASIC_TEST: - nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, - NULL); + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i); break; case PIN_LONGTERM_BENCHMARK: nr = pin_user_pages(addr, nr, gup->gup_flags | FOLL_LONGTERM, - pages + i, NULL); + pages + i); break; case DUMP_USER_PAGES_TEST: if (gup->test_flags & 
GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) nr = pin_user_pages(addr, nr, gup->gup_flags, - pages + i, NULL); + pages + i); else nr = get_user_pages(addr, nr, gup->gup_flags, pages + i); @@ -270,7 +269,7 @@ static inline int pin_longterm_test_start(unsigned long arg) gup_flags, pages); else cur_pages = pin_user_pages(addr, remaining_pages, - gup_flags, pages, NULL); + gup_flags, pages); if (cur_pages < 0) { pin_longterm_test_stop(); ret = cur_pages; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 02207e852d79..06cead2b8e34 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -103,7 +103,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) mmap_read_lock(current->mm); npgs = pin_user_pages(address, umem->npgs, - gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); + gup_flags | FOLL_LONGTERM, &umem->pgs[0]); mmap_read_unlock(current->mm); if (npgs != umem->npgs) { -- cgit v1.2.3 From b2cac248191b7466c5819e0da617b0705a26e197 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 May 2023 20:25:48 +0100 Subject: mm/gup: remove vmas array from internal GUP functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we have eliminated all callers to GUP APIs which use the vmas parameter, eliminate it altogether. This eliminates a class of bugs where vmas might have been kept around longer than the mmap_lock and thus we need not be concerned about locks being dropped during this operation leaving behind dangling pointers. This simplifies the GUP API and makes it considerably clearer as to its purpose - follow flags are applied and if pinning, an array of pages is returned. 
Link: https://lkml.kernel.org/r/6811b4b2b4b3baf3dd07f422bb18853bb2cd09fb.1684350871.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Christoph Hellwig Cc: Catalin Marinas Cc: Christian König Cc: Dennis Dalessandro Cc: Greg Kroah-Hartman Cc: Janosch Frank Cc: Jarkko Sakkinen Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Cc: Sakari Ailus Cc: Sean Christopherson Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 +++--- mm/gup.c | 83 ++++++++++++++++++------------------------------- mm/hugetlb.c | 24 ++++++-------- 3 files changed, 45 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f1543a0568ff..21f942025fec 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -133,9 +133,8 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, - struct page **, struct vm_area_struct **, - unsigned long *, unsigned long *, long, unsigned int, - int *); + struct page **, unsigned long *, unsigned long *, + long, unsigned int, int *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); @@ -306,9 +305,8 @@ static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, static inline long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, - struct vm_area_struct **vmas, unsigned long *position, - unsigned long *nr_pages, long i, unsigned int flags, - int *nonblocking) + unsigned long *position, unsigned long *nr_pages, + long i, unsigned int flags, int *nonblocking) { BUG(); return 0; diff --git a/mm/gup.c b/mm/gup.c index 18e3bc2ee3f1..8db58305f4eb 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1024,8 +1024,6 @@ static int 
check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. * @locked: whether we're still with the mmap_lock held * * Returns either number of pages pinned (which may be less than the @@ -1039,8 +1037,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * * The caller is responsible for releasing returned @pages, via put_page(). * - * @vmas are valid only as long as mmap_lock is held. - * * Must be called with mmap_lock held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to @@ -1076,7 +1072,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) + int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; @@ -1116,9 +1112,9 @@ static long __get_user_pages(struct mm_struct *mm, goto out; if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, - gup_flags, locked); + i = follow_hugetlb_page(mm, vma, pages, + &start, &nr_pages, i, + gup_flags, locked); if (!*locked) { /* * We've got a VM_FAULT_RETRY @@ -1183,10 +1179,6 @@ retry: ctx.page_mask = 0; } next_page: - if (vmas) { - vmas[i] = vma; - ctx.page_mask = 0; - } page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; @@ -1341,7 +1333,6 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, int 
*locked, unsigned int flags) { @@ -1379,7 +1370,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, pages_done = 0; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, - vmas, locked); + locked); if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; @@ -1443,7 +1434,7 @@ retry: *locked = 1; ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, - pages, NULL, locked); + pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ BUG_ON(ret != 0); @@ -1541,7 +1532,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, NULL, locked ? locked : &local_locked); + NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; } @@ -1599,7 +1590,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, return -EINVAL; ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, NULL, locked); + NULL, locked); lru_add_drain(); return ret; } @@ -1667,8 +1658,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, int *locked, - unsigned int foll_flags) + int *locked, unsigned int foll_flags) { struct vm_area_struct *vma; bool must_unlock = false; @@ -1712,8 +1702,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, if (pages[i]) get_page(pages[i]); } - if (vmas) - vmas[i] = vma; + start = (start + PAGE_SIZE) & PAGE_MASK; } @@ -1894,8 +1883,7 @@ struct page *get_dump_page(unsigned long addr) int locked = 0; int ret; - ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL, - &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, FOLL_FORCE | 
FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } @@ -2068,7 +2056,6 @@ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, int *locked, unsigned int gup_flags) { @@ -2076,13 +2063,13 @@ static long __gup_longterm_locked(struct mm_struct *mm, long rc, nr_pinned_pages; if (!(gup_flags & FOLL_LONGTERM)) - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, - pages, vmas, locked, + pages, locked, gup_flags); if (nr_pinned_pages <= 0) { rc = nr_pinned_pages; @@ -2100,9 +2087,8 @@ static long __gup_longterm_locked(struct mm_struct *mm, * Check that the given flags are valid for the exported gup/pup interface, and * update them with the required flags that the caller must have set. */ -static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, - int *locked, unsigned int *gup_flags_p, - unsigned int to_set) +static bool is_valid_gup_args(struct page **pages, int *locked, + unsigned int *gup_flags_p, unsigned int to_set) { unsigned int gup_flags = *gup_flags_p; @@ -2144,13 +2130,6 @@ static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, (gup_flags & FOLL_PCI_P2PDMA))) return false; - /* - * Can't use VMAs with locked, as locked allows GUP to unlock - * which invalidates the vmas array - */ - if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE))) - return false; - *gup_flags_p = gup_flags; return true; } @@ -2219,11 +2198,11 @@ long get_user_pages_remote(struct mm_struct *mm, { int local_locked = 1; - if (!is_valid_gup_args(pages, NULL, locked, &gup_flags, + if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; - return __get_user_pages_locked(mm, start, nr_pages, pages, NULL, + return 
__get_user_pages_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } @@ -2258,11 +2237,11 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, { int locked = 1; - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_TOUCH)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - NULL, &locked, gup_flags); + &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages); @@ -2286,12 +2265,12 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH | FOLL_UNLOCKABLE)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, - NULL, &locked, gup_flags); + &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -2981,7 +2960,7 @@ static int internal_get_user_pages_fast(unsigned long start, start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, - pages, NULL, &locked, + pages, &locked, gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* @@ -3023,7 +3002,7 @@ int get_user_pages_fast_only(unsigned long start, int nr_pages, * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; @@ -3056,7 +3035,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. 
*/ - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } @@ -3081,7 +3060,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } @@ -3114,10 +3093,10 @@ long pin_user_pages_remote(struct mm_struct *mm, { int local_locked = 1; - if (!is_valid_gup_args(pages, NULL, locked, &gup_flags, + if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; - return __gup_longterm_locked(mm, start, nr_pages, pages, NULL, + return __gup_longterm_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } @@ -3143,10 +3122,10 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, { int locked = 1; - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN)) + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, NULL, &locked, gup_flags); + pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); @@ -3160,11 +3139,11 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, { int locked = 0; - if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, + if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; - return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, + return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f154019e6b84..ea24718db4af 100644 --- 
a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6425,17 +6425,14 @@ out_release_nounlock: } #endif /* CONFIG_USERFAULTFD */ -static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, - int refs, struct page **pages, - struct vm_area_struct **vmas) +static void record_subpages(struct page *page, struct vm_area_struct *vma, + int refs, struct page **pages) { int nr; for (nr = 0; nr < refs; nr++) { if (likely(pages)) pages[nr] = nth_page(page, nr); - if (vmas) - vmas[nr] = vma; } } @@ -6508,9 +6505,9 @@ out_unlock: } long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags, int *locked) + struct page **pages, unsigned long *position, + unsigned long *nr_pages, long i, unsigned int flags, + int *locked) { unsigned long pfn_offset; unsigned long vaddr = *position; @@ -6638,7 +6635,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * If subpage information not requested, update counters * and skip the same_page loop below. */ - if (!pages && !vmas && !pfn_offset && + if (!pages && !pfn_offset && (vaddr + huge_page_size(h) < vma->vm_end) && (remainder >= pages_per_huge_page(h))) { vaddr += huge_page_size(h); @@ -6653,11 +6650,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, refs = min3(pages_per_huge_page(h) - pfn_offset, remainder, (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); - if (pages || vmas) - record_subpages_vmas(nth_page(page, pfn_offset), - vma, refs, - likely(pages) ? pages + i : NULL, - vmas ? vmas + i : NULL); + if (pages) + record_subpages(nth_page(page, pfn_offset), + vma, refs, + likely(pages) ? 
pages + i : NULL); if (pages) { /* -- cgit v1.2.3 From 4e096ae1801e24b338e02715c65c3ffa8883ba5d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 13 May 2023 01:11:01 +0100 Subject: mm: convert migrate_pages() to work on folios Almost all of the callers & implementors of migrate_pages() were already converted to use folios. compaction_alloc() & compaction_free() are trivial to convert as part of this patch and not worth splitting out. Link: https://lkml.kernel.org/r/20230513001101.276972-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: "Huang, Ying" Signed-off-by: Andrew Morton --- Documentation/mm/page_migration.rst | 7 +- .../translations/zh_CN/mm/page_migration.rst | 2 +- include/linux/migrate.h | 16 +- mm/compaction.c | 15 +- mm/mempolicy.c | 15 +- mm/migrate.c | 161 ++++++++++----------- mm/vmscan.c | 15 +- 7 files changed, 108 insertions(+), 123 deletions(-) (limited to 'include') diff --git a/Documentation/mm/page_migration.rst b/Documentation/mm/page_migration.rst index 313dce18893e..e35af7805be5 100644 --- a/Documentation/mm/page_migration.rst +++ b/Documentation/mm/page_migration.rst @@ -73,14 +73,13 @@ In kernel use of migrate_pages() It also prevents the swapper or other scans from encountering the page. -2. We need to have a function of type new_page_t that can be +2. We need to have a function of type new_folio_t that can be passed to migrate_pages(). This function should figure out - how to allocate the correct new page given the old page. + how to allocate the correct new folio given the old folio. 3. The migrate_pages() function is called which attempts to do the migration. It will call the function to allocate - the new page for each page that is considered for - moving. + the new folio for each folio that is considered for moving. 
How migrate_pages() works ========================= diff --git a/Documentation/translations/zh_CN/mm/page_migration.rst b/Documentation/translations/zh_CN/mm/page_migration.rst index 076081dc1635..f95063826a15 100644 --- a/Documentation/translations/zh_CN/mm/page_migration.rst +++ b/Documentation/translations/zh_CN/mm/page_migration.rst @@ -55,7 +55,7 @@ mbind()设置一个新的内存策略。一个进程的页面也可以通过sys_ 消失。它还可以防止交换器或其他扫描器遇到该页。 -2. 我们需要有一个new_page_t类型的函数,可以传递给migrate_pages()。这个函数应该计算 +2. 我们需要有一个new_folio_t类型的函数,可以传递给migrate_pages()。这个函数应该计算 出如何在给定的旧页面中分配正确的新页面。 3. migrate_pages()函数被调用,它试图进行迁移。它将调用该函数为每个被考虑迁移的页面分 diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6241a1596a75..6de5756d8533 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,8 +7,8 @@ #include #include -typedef struct page *new_page_t(struct page *page, unsigned long private); -typedef void free_page_t(struct page *page, unsigned long private); +typedef struct folio *new_folio_t(struct folio *folio, unsigned long private); +typedef void free_folio_t(struct folio *folio, unsigned long private); struct migration_target_control; @@ -67,10 +67,10 @@ int migrate_folio_extra(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); -int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, +int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded); -struct page *alloc_migration_target(struct page *page, unsigned long private); +struct folio *alloc_migration_target(struct folio *src, unsigned long private); bool isolate_movable_page(struct page *page, isolate_mode_t mode); int migrate_huge_page_move_mapping(struct address_space *mapping, @@ -85,11 +85,11 @@ int folio_migrate_mapping(struct address_space 
*mapping, #else static inline void putback_movable_pages(struct list_head *l) {} -static inline int migrate_pages(struct list_head *l, new_page_t new, - free_page_t free, unsigned long private, enum migrate_mode mode, - int reason, unsigned int *ret_succeeded) +static inline int migrate_pages(struct list_head *l, new_folio_t new, + free_folio_t free, unsigned long private, + enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { return -ENOSYS; } -static inline struct page *alloc_migration_target(struct page *page, +static inline struct folio *alloc_migration_target(struct folio *src, unsigned long private) { return NULL; } static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode) diff --git a/mm/compaction.c b/mm/compaction.c index f6465ae74d3f..e23e00bec030 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1685,11 +1685,10 @@ splitmap: * This is a migrate-callback that "allocates" freepages by taking pages * from the isolated freelists in the block we are migrating to. */ -static struct page *compaction_alloc(struct page *migratepage, - unsigned long data) +static struct folio *compaction_alloc(struct folio *src, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; - struct page *freepage; + struct folio *dst; if (list_empty(&cc->freepages)) { isolate_freepages(cc); @@ -1698,11 +1697,11 @@ static struct page *compaction_alloc(struct page *migratepage, return NULL; } - freepage = list_entry(cc->freepages.next, struct page, lru); - list_del(&freepage->lru); + dst = list_entry(cc->freepages.next, struct folio, lru); + list_del(&dst->lru); cc->nr_freepages--; - return freepage; + return dst; } /* @@ -1710,11 +1709,11 @@ static struct page *compaction_alloc(struct page *migratepage, * freelist. All pages on the freelist are from the same zone, so there is no * special handling needed for NUMA. 
*/ -static void compaction_free(struct page *page, unsigned long data) +static void compaction_free(struct folio *dst, unsigned long data) { struct compact_control *cc = (struct compact_control *)data; - list_add(&page->lru, &cc->freepages); + list_add(&dst->lru, &cc->freepages); cc->nr_freepages++; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1756389a0609..f06ca8c18e62 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1195,24 +1195,22 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_page(struct page *page, unsigned long start) +static struct folio *new_folio(struct folio *src, unsigned long start) { - struct folio *dst, *src = page_folio(page); struct vm_area_struct *vma; unsigned long address; VMA_ITERATOR(vmi, current->mm, start); gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; for_each_vma(vmi, vma) { - address = page_address_in_vma(page, vma); + address = page_address_in_vma(&src->page, vma); if (address != -EFAULT) break; } if (folio_test_hugetlb(src)) { - dst = alloc_hugetlb_folio_vma(folio_hstate(src), + return alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); - return &dst->page; } if (folio_test_large(src)) @@ -1221,9 +1219,8 @@ static struct page *new_page(struct page *page, unsigned long start) /* * if !vma, vma_alloc_folio() will use task or system default policy */ - dst = vma_alloc_folio(gfp, folio_order(src), vma, address, + return vma_alloc_folio(gfp, folio_order(src), vma, address, folio_test_large(src)); - return &dst->page; } #else @@ -1239,7 +1236,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_page(struct page *page, unsigned long start) +static struct folio *new_folio(struct folio *src, unsigned long start) { return NULL; } @@ -1334,7 +1331,7 @@ static long do_mbind(unsigned long start, unsigned long 
len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_page, NULL, + nr_failed = migrate_pages(&pagelist, new_folio, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); if (nr_failed) putback_movable_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c index cb292d2a90ce..30b5ce10935e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1067,15 +1067,13 @@ static void migrate_folio_undo_src(struct folio *src, } /* Restore the destination folio to the original state upon failure */ -static void migrate_folio_undo_dst(struct folio *dst, - bool locked, - free_page_t put_new_page, - unsigned long private) +static void migrate_folio_undo_dst(struct folio *dst, bool locked, + free_folio_t put_new_folio, unsigned long private) { if (locked) folio_unlock(dst); - if (put_new_page) - put_new_page(&dst->page, private); + if (put_new_folio) + put_new_folio(dst, private); else folio_put(dst); } @@ -1099,14 +1097,13 @@ static void migrate_folio_done(struct folio *src, } /* Obtain the lock on page, remove all ptes. 
*/ -static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct folio *src, - struct folio **dstp, enum migrate_mode mode, - enum migrate_reason reason, struct list_head *ret) +static int migrate_folio_unmap(new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, + struct folio *src, struct folio **dstp, enum migrate_mode mode, + enum migrate_reason reason, struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; - struct page *newpage = NULL; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(&src->page); @@ -1123,10 +1120,9 @@ static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page return MIGRATEPAGE_SUCCESS; } - newpage = get_new_page(&src->page, private); - if (!newpage) + dst = get_new_folio(src, private); + if (!dst) return -ENOMEM; - dst = page_folio(newpage); *dstp = dst; dst->private = NULL; @@ -1254,13 +1250,13 @@ out: ret = NULL; migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret); - migrate_folio_undo_dst(dst, dst_locked, put_new_page, private); + migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); return rc; } /* Migrate the folio to the newly allocated folio in dst. */ -static int migrate_folio_move(free_page_t put_new_page, unsigned long private, +static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio *dst, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) @@ -1332,7 +1328,7 @@ out: } migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret); - migrate_folio_undo_dst(dst, true, put_new_page, private); + migrate_folio_undo_dst(dst, true, put_new_folio, private); return rc; } @@ -1355,16 +1351,14 @@ out: * because then pte is replaced with migration swap entry and direct I/O code * will wait in the page fault for migration to complete. 
*/ -static int unmap_and_move_huge_page(new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, - struct page *hpage, int force, - enum migrate_mode mode, int reason, - struct list_head *ret) +static int unmap_and_move_huge_page(new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, + struct folio *src, int force, enum migrate_mode mode, + int reason, struct list_head *ret) { - struct folio *dst, *src = page_folio(hpage); + struct folio *dst; int rc = -EAGAIN; int page_was_mapped = 0; - struct page *new_hpage; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; @@ -1374,10 +1368,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return MIGRATEPAGE_SUCCESS; } - new_hpage = get_new_page(hpage, private); - if (!new_hpage) + dst = get_new_folio(src, private); + if (!dst) return -ENOMEM; - dst = page_folio(new_hpage); if (!folio_trylock(src)) { if (!force) @@ -1418,7 +1411,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock. */ - mapping = hugetlb_page_mapping_lock_write(hpage); + mapping = hugetlb_page_mapping_lock_write(&src->page); if (unlikely(!mapping)) goto unlock_put_anon; @@ -1448,7 +1441,7 @@ put_anon: if (rc == MIGRATEPAGE_SUCCESS) { move_hugetlb_state(src, dst, reason); - put_new_page = NULL; + put_new_folio = NULL; } out_unlock: @@ -1464,8 +1457,8 @@ out: * it. Otherwise, put_page() will drop the reference grabbed during * isolation. */ - if (put_new_page) - put_new_page(new_hpage, private); + if (put_new_folio) + put_new_folio(dst, private); else folio_putback_active_hugetlb(dst); @@ -1512,8 +1505,8 @@ struct migrate_pages_stats { * exist any more. It is caller's responsibility to call putback_movable_pages() * only if ret != 0. 
*/ -static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, +static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, struct migrate_pages_stats *stats, struct list_head *ret_folios) @@ -1551,9 +1544,9 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, continue; } - rc = unmap_and_move_huge_page(get_new_page, - put_new_page, private, - &folio->page, pass > 2, mode, + rc = unmap_and_move_huge_page(get_new_folio, + put_new_folio, private, + folio, pass > 2, mode, reason, ret_folios); /* * The rules are: @@ -1610,11 +1603,11 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the * length of the from list must be <= 1. */ -static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason, struct list_head *ret_folios, - struct list_head *split_folios, struct migrate_pages_stats *stats, - int nr_pass) +static int migrate_pages_batch(struct list_head *from, + new_folio_t get_new_folio, free_folio_t put_new_folio, + unsigned long private, enum migrate_mode mode, int reason, + struct list_head *ret_folios, struct list_head *split_folios, + struct migrate_pages_stats *stats, int nr_pass) { int retry = 1; int thp_retry = 1; @@ -1664,8 +1657,9 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, continue; } - rc = migrate_folio_unmap(get_new_page, put_new_page, private, - folio, &dst, mode, reason, ret_folios); + rc = migrate_folio_unmap(get_new_folio, put_new_folio, + private, folio, &dst, mode, reason, + ret_folios); /* * The rules are: * Success: folio will be freed @@ -1762,7 +1756,7 @@ move: cond_resched(); - rc = 
migrate_folio_move(put_new_page, private, + rc = migrate_folio_move(put_new_folio, private, folio, dst, mode, reason, ret_folios); /* @@ -1808,7 +1802,7 @@ out: migrate_folio_undo_src(folio, page_was_mapped, anon_vma, true, ret_folios); list_del(&dst->lru); - migrate_folio_undo_dst(dst, true, put_new_page, private); + migrate_folio_undo_dst(dst, true, put_new_folio, private); dst = dst2; dst2 = list_next_entry(dst, lru); } @@ -1816,10 +1810,11 @@ out: return rc; } -static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason, struct list_head *ret_folios, - struct list_head *split_folios, struct migrate_pages_stats *stats) +static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, + enum migrate_mode mode, int reason, + struct list_head *ret_folios, struct list_head *split_folios, + struct migrate_pages_stats *stats) { int rc, nr_failed = 0; LIST_HEAD(folios); @@ -1827,7 +1822,7 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, memset(&astats, 0, sizeof(astats)); /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */ - rc = migrate_pages_batch(from, get_new_page, put_new_page, private, MIGRATE_ASYNC, + rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC, reason, &folios, split_folios, &astats, NR_MAX_MIGRATE_ASYNC_RETRY); stats->nr_succeeded += astats.nr_succeeded; @@ -1849,7 +1844,7 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, list_splice_tail_init(&folios, from); while (!list_empty(from)) { list_move(from->next, &folios); - rc = migrate_pages_batch(&folios, get_new_page, put_new_page, + rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, ret_folios, split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY); list_splice_tail_init(&folios, ret_folios); @@ 
-1866,11 +1861,11 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, * supplied as the target for the page migration * * @from: The list of folios to be migrated. - * @get_new_page: The function used to allocate free folios to be used + * @get_new_folio: The function used to allocate free folios to be used * as the target of the folio migration. - * @put_new_page: The function used to free target folios if migration + * @put_new_folio: The function used to free target folios if migration * fails, or NULL if no special handling is necessary. - * @private: Private data to be passed on to get_new_page() + * @private: Private data to be passed on to get_new_folio() * @mode: The migration mode that specifies the constraints for * folio migration, if any. * @reason: The reason for folio migration. @@ -1887,8 +1882,8 @@ static int migrate_pages_sync(struct list_head *from, new_page_t get_new_page, * considered as the number of non-migrated large folio, no matter how many * split folios of the large folio are migrated successfully. 
*/ -int migrate_pages(struct list_head *from, new_page_t get_new_page, - free_page_t put_new_page, unsigned long private, +int migrate_pages(struct list_head *from, new_folio_t get_new_folio, + free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int rc, rc_gather; @@ -1903,7 +1898,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, memset(&stats, 0, sizeof(stats)); - rc_gather = migrate_hugetlbs(from, get_new_page, put_new_page, private, + rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private, mode, reason, &stats, &ret_folios); if (rc_gather < 0) goto out; @@ -1926,12 +1921,14 @@ again: else list_splice_init(from, &folios); if (mode == MIGRATE_ASYNC) - rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private, - mode, reason, &ret_folios, &split_folios, &stats, - NR_MAX_MIGRATE_PAGES_RETRY); + rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, + private, mode, reason, &ret_folios, + &split_folios, &stats, + NR_MAX_MIGRATE_PAGES_RETRY); else - rc = migrate_pages_sync(&folios, get_new_page, put_new_page, private, - mode, reason, &ret_folios, &split_folios, &stats); + rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio, + private, mode, reason, &ret_folios, + &split_folios, &stats); list_splice_tail_init(&folios, &ret_folios); if (rc < 0) { rc_gather = rc; @@ -1944,8 +1941,9 @@ again: * is counted as 1 failure already. And, we only try to migrate * with minimal effort, force MIGRATE_ASYNC mode and retry once. 
*/ - migrate_pages_batch(&split_folios, get_new_page, put_new_page, private, - MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1); + migrate_pages_batch(&split_folios, get_new_folio, + put_new_folio, private, MIGRATE_ASYNC, reason, + &ret_folios, NULL, &stats, 1); list_splice_tail_init(&split_folios, &ret_folios); } rc_gather += rc; @@ -1980,14 +1978,11 @@ out: return rc_gather; } -struct page *alloc_migration_target(struct page *page, unsigned long private) +struct folio *alloc_migration_target(struct folio *src, unsigned long private) { - struct folio *folio = page_folio(page); struct migration_target_control *mtc; gfp_t gfp_mask; unsigned int order = 0; - struct folio *hugetlb_folio = NULL; - struct folio *new_folio = NULL; int nid; int zidx; @@ -1995,33 +1990,30 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) gfp_mask = mtc->gfp_mask; nid = mtc->nid; if (nid == NUMA_NO_NODE) - nid = folio_nid(folio); + nid = folio_nid(src); - if (folio_test_hugetlb(folio)) { - struct hstate *h = folio_hstate(folio); + if (folio_test_hugetlb(src)) { + struct hstate *h = folio_hstate(src); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); - hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid, + return alloc_hugetlb_folio_nodemask(h, nid, mtc->nmask, gfp_mask); - return &hugetlb_folio->page; } - if (folio_test_large(folio)) { + if (folio_test_large(src)) { /* * clear __GFP_RECLAIM to make the migration callback * consistent with regular THP allocations. 
*/ gfp_mask &= ~__GFP_RECLAIM; gfp_mask |= GFP_TRANSHUGE; - order = folio_order(folio); + order = folio_order(src); } - zidx = zone_idx(folio_zone(folio)); + zidx = zone_idx(folio_zone(src)); if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) gfp_mask |= __GFP_HIGHMEM; - new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask); - - return &new_folio->page; + return __folio_alloc(gfp_mask, order, nid, mtc->nmask); } #ifdef CONFIG_NUMA @@ -2472,13 +2464,12 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, return false; } -static struct page *alloc_misplaced_dst_page(struct page *page, +static struct folio *alloc_misplaced_dst_folio(struct folio *src, unsigned long data) { int nid = (int) data; - int order = compound_order(page); + int order = folio_order(src); gfp_t gfp = __GFP_THISNODE; - struct folio *new; if (order > 0) gfp |= GFP_TRANSHUGE_LIGHT; @@ -2487,9 +2478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN; gfp &= ~__GFP_RECLAIM; } - new = __folio_alloc_node(gfp, order, nid); - - return &new->page; + return __folio_alloc_node(gfp, order, nid); } static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) @@ -2567,7 +2556,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, goto out; list_add(&page->lru, &migratepages); - nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, + nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, NULL, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED, &nr_succeeded); if (nr_remaining) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 15efbfbb1963..4637f6462e9c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1621,9 +1621,10 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } -static struct page *alloc_demote_page(struct page *page, unsigned long private) +static struct folio *alloc_demote_folio(struct folio *src, + unsigned long private) { - 
struct page *target_page; + struct folio *dst; nodemask_t *allowed_mask; struct migration_target_control *mtc; @@ -1641,14 +1642,14 @@ static struct page *alloc_demote_page(struct page *page, unsigned long private) */ mtc->nmask = NULL; mtc->gfp_mask |= __GFP_THISNODE; - target_page = alloc_migration_target(page, (unsigned long)mtc); - if (target_page) - return target_page; + dst = alloc_migration_target(src, (unsigned long)mtc); + if (dst) + return dst; mtc->gfp_mask &= ~__GFP_THISNODE; mtc->nmask = allowed_mask; - return alloc_migration_target(page, (unsigned long)mtc); + return alloc_migration_target(src, (unsigned long)mtc); } /* @@ -1683,7 +1684,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_folios, alloc_demote_page, NULL, + migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); -- cgit v1.2.3 From 89f499f35c11af61ba7075ddc23209d10805a25a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 18 May 2023 10:55:14 -0400 Subject: maple_tree: add format option to mt_dump() Allow different formatting strings to be used when dumping the tree. Currently supports hex and decimal. Link: https://lkml.kernel.org/r/20230518145544.1722059-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Vernon Yang Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 9 ++++- lib/maple_tree.c | 87 ++++++++++++++++++++++++++-------------- lib/test_maple_tree.c | 10 ++--- mm/internal.h | 4 +- mm/mmap.c | 8 ++-- tools/testing/radix-tree/maple.c | 12 +++--- 6 files changed, 82 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 1fadb5f5978b..140fb271be4a 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -670,10 +670,15 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); #ifdef CONFIG_DEBUG_MAPLE_TREE +enum mt_dump_format { + mt_dump_dec, + mt_dump_hex, +}; + extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; -void mt_dump(const struct maple_tree *mt); +void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ @@ -681,7 +686,7 @@ void mt_cache_shrink(void); if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ - mt_dump(__tree); \ + mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index e095e2c39a1b..dfa0271101d2 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5694,7 +5694,7 @@ void *mas_store(struct ma_state *mas, void *entry) trace_ma_write(__func__, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (mas->index > mas->last) - pr_err("Error %lu > %lu %p\n", mas->index, mas->last, entry); + pr_err("Error %lX > %lX %p\n", mas->index, mas->last, entry); MT_BUG_ON(mas->tree, mas->index > mas->last); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); @@ -6748,22 +6748,33 @@ static void mas_dfs_postorder(struct ma_state *mas, unsigned 
long max) /* Tree validations */ static void mt_dump_node(const struct maple_tree *mt, void *entry, - unsigned long min, unsigned long max, unsigned int depth); + unsigned long min, unsigned long max, unsigned int depth, + enum mt_dump_format format); static void mt_dump_range(unsigned long min, unsigned long max, - unsigned int depth) + unsigned int depth, enum mt_dump_format format) { static const char spaces[] = " "; - if (min == max) - pr_info("%.*s%lu: ", depth * 2, spaces, min); - else - pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); + switch(format) { + case mt_dump_hex: + if (min == max) + pr_info("%.*s%lx: ", depth * 2, spaces, min); + else + pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); + break; + default: + case mt_dump_dec: + if (min == max) + pr_info("%.*s%lu: ", depth * 2, spaces, min); + else + pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); + } } static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, - unsigned int depth) + unsigned int depth, enum mt_dump_format format) { - mt_dump_range(min, max, depth); + mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry), @@ -6777,7 +6788,8 @@ static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, } static void mt_dump_range64(const struct maple_tree *mt, void *entry, - unsigned long min, unsigned long max, unsigned int depth) + unsigned long min, unsigned long max, unsigned int depth, + enum mt_dump_format format) { struct maple_range_64 *node = &mte_to_node(entry)->mr64; bool leaf = mte_is_leaf(entry); @@ -6785,8 +6797,16 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, int i; pr_cont(" contents: "); - for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) - pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { + switch(format) { + case mt_dump_hex: + pr_cont("%p %lX ", node->slot[i], node->pivot[i]); + break; + 
default: + case mt_dump_dec: + pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + } + } pr_cont("%p\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; @@ -6799,24 +6819,32 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), - first, last, depth + 1); + first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), - first, last, depth + 1); + first, last, depth + 1, format); if (last == max) break; if (last > max) { - pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + switch(format) { + case mt_dump_hex: + pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); - break; + break; + default: + case mt_dump_dec: + pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + node, last, max, i); + } } first = last + 1; } } static void mt_dump_arange64(const struct maple_tree *mt, void *entry, - unsigned long min, unsigned long max, unsigned int depth) + unsigned long min, unsigned long max, unsigned int depth, + enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; bool leaf = mte_is_leaf(entry); @@ -6841,10 +6869,10 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), - first, last, depth + 1); + first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), - first, last, depth + 1); + first, last, depth + 1, format); if (last == max) break; @@ -6858,13 +6886,14 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, } static void mt_dump_node(const struct maple_tree *mt, void *entry, - unsigned long min, unsigned long max, unsigned int depth) + unsigned long min, unsigned long max, unsigned int depth, + enum mt_dump_format format) { struct maple_node *node = mte_to_node(entry); unsigned int type = 
mte_node_type(entry); unsigned int i; - mt_dump_range(min, max, depth); + mt_dump_range(min, max, depth, format); pr_cont("node %p depth %d type %d parent %p", node, depth, type, node ? node->parent : NULL); @@ -6875,15 +6904,15 @@ static void mt_dump_node(const struct maple_tree *mt, void *entry, if (min + i > max) pr_cont("OUT OF RANGE: "); mt_dump_entry(mt_slot(mt, node->slot, i), - min + i, min + i, depth); + min + i, min + i, depth, format); } break; case maple_leaf_64: case maple_range_64: - mt_dump_range64(mt, entry, min, max, depth); + mt_dump_range64(mt, entry, min, max, depth, format); break; case maple_arange_64: - mt_dump_arange64(mt, entry, min, max, depth); + mt_dump_arange64(mt, entry, min, max, depth, format); break; default: @@ -6891,16 +6920,16 @@ static void mt_dump_node(const struct maple_tree *mt, void *entry, } } -void mt_dump(const struct maple_tree *mt) +void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); pr_info("maple_tree(%p) flags %X, height %u root %p\n", mt, mt->ma_flags, mt_height(mt), entry); if (!xa_is_node(entry)) - mt_dump_entry(entry, 0, 0, 0); + mt_dump_entry(entry, 0, 0, 0, format); else if (entry) - mt_dump_node(mt, entry, 0, mt_node_max(entry), 0); + mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); } EXPORT_SYMBOL_GPL(mt_dump); @@ -6953,7 +6982,7 @@ static void mas_validate_gaps(struct ma_state *mas) mas_mn(mas), i, mas_get_slot(mas, i), gap, p_end, p_start); - mt_dump(mas->tree); + mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, gap != p_end - p_start + 1); @@ -6986,7 +7015,7 @@ counted: MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap); - mt_dump(mas->tree); + mt_dump(mas->tree, mt_dump_hex); } MT_BUG_ON(mas->tree, diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index f1db333270e9..d6929270dd36 100644 
--- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -219,7 +219,7 @@ static noinline void check_rev_seq(struct maple_tree *mt, unsigned long max, #ifndef __KERNEL__ if (verbose) { rcu_barrier(); - mt_dump(mt); + mt_dump(mt, mt_dump_dec); pr_info(" %s test of 0-%lu %luK in %d active (%d total)\n", __func__, max, mt_get_alloc_size()/1024, mt_nr_allocated(), mt_nr_tallocated()); @@ -248,7 +248,7 @@ static noinline void check_seq(struct maple_tree *mt, unsigned long max, #ifndef __KERNEL__ if (verbose) { rcu_barrier(); - mt_dump(mt); + mt_dump(mt, mt_dump_dec); pr_info(" seq test of 0-%lu %luK in %d active (%d total)\n", max, mt_get_alloc_size()/1024, mt_nr_allocated(), mt_nr_tallocated()); @@ -893,7 +893,7 @@ static noinline void check_alloc_range(struct maple_tree *mt) #if DEBUG_ALLOC_RANGE pr_debug("\tInsert %lu-%lu\n", range[i] >> 12, (range[i + 1] >> 12) - 1); - mt_dump(mt); + mt_dump(mt, mt_dump_hex); #endif check_insert_range(mt, range[i] >> 12, (range[i + 1] >> 12) - 1, xa_mk_value(range[i] >> 12), 0); @@ -934,7 +934,7 @@ static noinline void check_alloc_range(struct maple_tree *mt) xa_mk_value(req_range[i] >> 12)); /* pointer */ mt_validate(mt); #if DEBUG_ALLOC_RANGE - mt_dump(mt); + mt_dump(mt, mt_dump_hex); #endif } @@ -1572,7 +1572,7 @@ static noinline void check_node_overwrite(struct maple_tree *mt) mtree_test_store_range(mt, i*100, i*100 + 50, xa_mk_value(i*100)); mtree_test_store_range(mt, 319951, 367950, NULL); - /*mt_dump(mt); */ + /*mt_dump(mt, mt_dump_dec); */ mt_validate(mt); } diff --git a/mm/internal.h b/mm/internal.h index ec55da813c13..692498a84fde 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1064,13 +1064,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi, printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); - mt_dump(vmi->mas.tree); + mt_dump(vmi->mas.tree, mt_dump_hex); } if (WARN_ON(vmi->mas.node 
!= MAS_START && vmi->mas.last < vma->vm_start)) { printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); - mt_dump(vmi->mas.tree); + mt_dump(vmi->mas.tree, mt_dump_hex); } #endif diff --git a/mm/mmap.c b/mm/mmap.c index 13678edaa22c..04bcf3b3c720 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -301,7 +301,7 @@ out: #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) extern void mt_validate(struct maple_tree *mt); -extern void mt_dump(const struct maple_tree *mt); +extern void mt_dump(const struct maple_tree *mt, enum mt_dump_format fmt); /* Validate the maple tree */ static void validate_mm_mt(struct mm_struct *mm) @@ -323,18 +323,18 @@ static void validate_mm_mt(struct mm_struct *mm) pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, vma_mt->vm_start, vma_mt->vm_end); - mt_dump(mas.tree); + mt_dump(mas.tree, mt_dump_hex); if (vma_mt->vm_end != mas.last + 1) { pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n", mm, vma_mt->vm_start, vma_mt->vm_end, mas.index, mas.last); - mt_dump(mas.tree); + mt_dump(mas.tree, mt_dump_hex); } VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); if (vma_mt->vm_start != mas.index) { pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n", mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); - mt_dump(mas.tree); + mt_dump(mas.tree, mt_dump_hex); } VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); } diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 75df543e019c..ebcb3faf85ea 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -1054,7 +1054,7 @@ static noinline void check_erase2_testset(struct maple_tree *mt, if (entry_count) MT_BUG_ON(mt, !mt_height(mt)); #if check_erase2_debug > 1 - mt_dump(mt); + mt_dump(mt, mt_dump_hex); #endif #if check_erase2_debug pr_err("Done\n"); @@ -1085,7 +1085,7 @@ static noinline void check_erase2_testset(struct maple_tree *mt, mas_for_each(&mas, foo, ULONG_MAX) 
{ if (xa_is_zero(foo)) { if (addr == mas.index) { - mt_dump(mas.tree); + mt_dump(mas.tree, mt_dump_hex); pr_err("retry failed %lu - %lu\n", mas.index, mas.last); MT_BUG_ON(mt, 1); @@ -34513,7 +34513,7 @@ static void *rcu_reader_rev(void *ptr) if (mas.index != r_start) { alt = xa_mk_value(index + i * 2 + 1 + RCU_RANGE_COUNT); - mt_dump(test->mt); + mt_dump(test->mt, mt_dump_dec); printk("Error: %lu-%lu %p != %lu-%lu %p %p line %d i %d\n", mas.index, mas.last, entry, r_start, r_end, expected, alt, @@ -35784,10 +35784,10 @@ void farmer_tests(void) struct maple_node *node; DEFINE_MTREE(tree); - mt_dump(&tree); + mt_dump(&tree, mt_dump_dec); tree.ma_root = xa_mk_value(0); - mt_dump(&tree); + mt_dump(&tree, mt_dump_dec); node = mt_alloc_one(GFP_KERNEL); node->parent = (void *)((unsigned long)(&tree) | 1); @@ -35797,7 +35797,7 @@ void farmer_tests(void) node->mr64.pivot[1] = 1; node->mr64.pivot[2] = 0; tree.ma_root = mt_mk_node(node, maple_leaf_64); - mt_dump(&tree); + mt_dump(&tree, mt_dump_dec); node->parent = ma_parent_ptr(node); ma_free_rcu(node); -- cgit v1.2.3 From f0a1f866aba1ca62ef6f17d1c441eba65f2d6845 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 18 May 2023 10:55:15 -0400 Subject: maple_tree: add debug BUG_ON and WARN_ON variants Add debug macros to dump the maple state and/or the tree for both warning and bug_on calls. Link: https://lkml.kernel.org/r/20230518145544.1722059-7-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Vernon Yang Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 100 +++++++++++++++++++++++++++++++++++++++++++-- lib/maple_tree.c | 34 ++++++++++++++- 2 files changed, 129 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 140fb271be4a..204d7941a39e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -482,13 +482,13 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, } /* Checks if a mas has not found anything */ -static inline bool mas_is_none(struct ma_state *mas) +static inline bool mas_is_none(const struct ma_state *mas) { return mas->node == MAS_NONE; } /* Checks if a mas has been paused */ -static inline bool mas_is_paused(struct ma_state *mas) +static inline bool mas_is_paused(const struct ma_state *mas) { return mas->node == MAS_PAUSE; } @@ -679,6 +679,8 @@ extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); +void mas_dump(const struct ma_state *mas); +void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ @@ -695,8 +697,100 @@ void mt_cache_shrink(void); atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) + +#define MAS_BUG_ON(__mas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_WR_BUG_ON(__wrmas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + 
pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MT_WARN_ON(__tree, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WARN_ON(__mas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WR_WARN_ON(__wrmas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) #else -#define MT_BUG_ON(__tree, __x) BUG_ON(__x) +#define MT_BUG_ON(__tree, __x) BUG_ON(__x) +#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) +#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) +#define MT_WARN_ON(__tree, 
__x) WARN_ON(__x) +#define MAS_WARN_ON(__mas, __x) WARN_ON(__x) +#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ #endif /*_LINUX_MAPLE_TREE_H */ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index dfa0271101d2..ff16b6c0ac08 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -240,12 +240,12 @@ static inline void mas_set_err(struct ma_state *mas, long err) mas->node = MA_ERROR(err); } -static inline bool mas_is_ptr(struct ma_state *mas) +static inline bool mas_is_ptr(const struct ma_state *mas) { return mas->node == MAS_ROOT; } -static inline bool mas_is_start(struct ma_state *mas) +static inline bool mas_is_start(const struct ma_state *mas) { return mas->node == MAS_START; } @@ -7246,4 +7246,34 @@ done: } EXPORT_SYMBOL_GPL(mt_validate); +void mas_dump(const struct ma_state *mas) +{ + pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node); + if (mas_is_none(mas)) + pr_err("(MAS_NONE) "); + else if (mas_is_ptr(mas)) + pr_err("(MAS_ROOT) "); + else if (mas_is_start(mas)) + pr_err("(MAS_START) "); + else if (mas_is_paused(mas)) + pr_err("(MAS_PAUSED) "); + + pr_err("[%u] index=%lx last=%lx\n", mas->offset, mas->index, mas->last); + pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n", + mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); + if (mas->index > mas->last) + pr_err("Check index & last\n"); +} +EXPORT_SYMBOL_GPL(mas_dump); + +void mas_wr_dump(const struct ma_wr_state *wr_mas) +{ + pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n", + wr_mas->node, wr_mas->r_min, wr_mas->r_max); + pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", + wr_mas->type, wr_mas->offset_end, wr_mas->node_end, + wr_mas->end_piv); +} +EXPORT_SYMBOL_GPL(mas_wr_dump); + #endif /* CONFIG_DEBUG_MAPLE_TREE */ -- cgit v1.2.3 From 7f2f9dc16fee59afdec2df8c794e97c36e387257 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Thu, 18 May 2023 10:55:17 -0400 Subject: maple_tree: change RCU checks to WARN_ON() instead of BUG_ON() If RCU is enabled and the tree isn't locked, just warn the user and avoid crashing the kernel. Link: https://lkml.kernel.org/r/20230518145544.1722059-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Vernon Yang Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 204d7941a39e..ed92abf4c1fb 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -616,7 +616,7 @@ static inline void mt_clear_in_rcu(struct maple_tree *mt) return; if (mt_external_lock(mt)) { - BUG_ON(!mt_lock_is_held(mt)); + WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); @@ -635,7 +635,7 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) return; if (mt_external_lock(mt)) { - BUG_ON(!mt_lock_is_held(mt)); + WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); -- cgit v1.2.3 From b50e195ff436625b26dcc9839bc52cc7c5bf1a54 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 18 May 2023 10:55:26 -0400 Subject: mm: update validate_mm() to use vma iterator Use the vma iterator in the validation code and combine the code to check the maple tree into the main validate_mm() function. Introduce a new function vma_iter_dump_tree() to dump the maple tree in hex layout. Replace all calls to validate_mm_mt() with validate_mm(). [Liam.Howlett@oracle.com: update validate_mm() to use vma iterator CONFIG flag] Link: https://lkml.kernel.org/r/20230606183538.588190-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230518145544.1722059-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Vernon Yang Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mmdebug.h | 14 ++++++++ mm/debug.c | 9 +++++ mm/internal.h | 3 +- mm/mmap.c | 94 +++++++++++++++++++------------------------------ 4 files changed, 61 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index b8728d11c949..7c3e7b0b0e8f 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -8,10 +8,12 @@ struct page; struct vm_area_struct; struct mm_struct; +struct vma_iterator; void dump_page(struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); +void vma_iter_dump_tree(const struct vma_iterator *vmi); #ifdef CONFIG_DEBUG_VM #define VM_BUG_ON(cond) BUG_ON(cond) @@ -74,6 +76,17 @@ void dump_mm(const struct mm_struct *mm); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_ONCE_MM(cond, mm) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + dump_mm(mm); \ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -90,6 +103,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE_MM(cond, mm) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) #endif diff --git a/mm/debug.c b/mm/debug.c index c7b228097bd9..ee533a5ceb79 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -268,4 +268,13 @@ void page_init_poison(struct page *page, size_t size) if (page_init_poisoning) memset(page, PAGE_POISON_PATTERN, size); } + +void vma_iter_dump_tree(const struct vma_iterator *vmi) +{ +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + mas_dump(&vmi->mas); + mt_dump(vmi->mas.tree, mt_dump_hex); +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ +} + #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/internal.h b/mm/internal.h index 692498a84fde..41cc5e6225fb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1064,13 +1064,14 @@ static inline void vma_iter_store(struct vma_iterator *vmi, printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); - mt_dump(vmi->mas.tree, mt_dump_hex); + vma_iter_dump_tree(vmi); } if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) { printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); mt_dump(vmi->mas.tree, mt_dump_hex); + vma_iter_dump_tree(vmi); } #endif diff --git a/mm/mmap.c b/mm/mmap.c index 04bcf3b3c720..8f67d80c6dde 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -300,61 +300,40 @@ out: } #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) -extern void mt_validate(struct maple_tree *mt); -extern void mt_dump(const struct maple_tree *mt, enum mt_dump_format fmt); - -/* Validate the maple tree */ -static void validate_mm_mt(struct mm_struct *mm) -{ - struct maple_tree *mt = &mm->mm_mt; - struct vm_area_struct *vma_mt; - - MA_STATE(mas, mt, 0, 0); - - mt_validate(&mm->mm_mt); - mas_for_each(&mas, vma_mt, ULONG_MAX) { - if ((vma_mt->vm_start != mas.index) || - (vma_mt->vm_end - 1 != mas.last)) { - pr_emerg("issue in %s\n", current->comm); - dump_stack(); - 
dump_vma(vma_mt); - pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, - mas.index, mas.last); - pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, - vma_mt->vm_start, vma_mt->vm_end); - - mt_dump(mas.tree, mt_dump_hex); - if (vma_mt->vm_end != mas.last + 1) { - pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n", - mm, vma_mt->vm_start, vma_mt->vm_end, - mas.index, mas.last); - mt_dump(mas.tree, mt_dump_hex); - } - VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); - if (vma_mt->vm_start != mas.index) { - pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n", - mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); - mt_dump(mas.tree, mt_dump_hex); - } - VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); - } - } -} - static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; - MA_STATE(mas, &mm->mm_mt, 0, 0); - - validate_mm_mt(mm); + VMA_ITERATOR(vmi, mm, 0); - mas_for_each(&mas, vma, ULONG_MAX) { + mt_validate(&mm->mm_mt); + for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; +#endif + unsigned long vmi_start, vmi_end; + bool warn = 0; + vmi_start = vma_iter_addr(&vmi); + vmi_end = vma_iter_end(&vmi); + if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) + warn = 1; + + if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) + warn = 1; + + if (warn) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); + dump_vma(vma); + pr_emerg("tree range: %px start %lx end %lx\n", vma, + vmi_start, vmi_end - 1); + vma_iter_dump_tree(&vmi); + } + +#ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) @@ -365,14 +344,13 @@ static void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i); + pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } #else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ -#define 
validate_mm_mt(root) do { } while (0) #define validate_mm(mm) do { } while (0) #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ @@ -2234,7 +2212,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); @@ -2292,7 +2270,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Success. */ if (new_below) vma_next(vmi); - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); return 0; out_free_mpol: @@ -2301,7 +2279,7 @@ out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); - validate_mm_mt(vma->vm_mm); + validate_mm(vma->vm_mm); return err; } @@ -2936,7 +2914,7 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, arch_unmap(mm, start, end); ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); - validate_mm_mt(mm); + validate_mm(mm); return ret; } @@ -2958,7 +2936,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm = current->mm; struct vma_prepare vp; - validate_mm_mt(mm); + validate_mm(mm); /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. @@ -3199,7 +3177,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); - validate_mm_mt(mm); + validate_mm(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. 
@@ -3258,7 +3236,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, goto out_vma_link; *need_rmap_locks = false; } - validate_mm_mt(mm); + validate_mm(mm); return new_vma; out_vma_link: @@ -3274,7 +3252,7 @@ out_free_mempol: out_free_vma: vm_area_free(new_vma); out: - validate_mm_mt(mm); + validate_mm(mm); return NULL; } @@ -3411,7 +3389,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - validate_mm_mt(mm); + validate_mm(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3434,12 +3412,12 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); - validate_mm_mt(mm); + validate_mm(mm); return vma; out: vm_area_free(vma); - validate_mm_mt(mm); + validate_mm(mm); return ERR_PTR(ret); } -- cgit v1.2.3 From 6169b553195a193c52a675e45a9578f595fe194f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 18 May 2023 10:55:37 -0400 Subject: maple_tree: add mas_next_range() and mas_find_range() interfaces Some users of the maple tree may want to move to the next range in the tree, even if it stores a NULL. This family of functions provides that functionality by advancing one slot at a time and returning the result, while mas_contiguous() will iterate over the range and stop on encountering the first NULL. Link: https://lkml.kernel.org/r/20230518145544.1722059-29-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Vernon Yang Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 3 +- lib/maple_tree.c | 172 ++++++++++++++++++++++++++++++++------------- 2 files changed, 124 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index ed92abf4c1fb..9d040043858a 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -455,6 +455,7 @@ void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); +void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); int mas_preallocate(struct ma_state *mas, gfp_t gfp); bool mas_is_err(struct ma_state *mas); @@ -467,6 +468,7 @@ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_next(struct ma_state *mas, unsigned long max); +void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); @@ -528,7 +530,6 @@ static inline void mas_reset(struct ma_state *mas) #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) - /** * mas_set_range() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. 
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 205664f46e58..41c152718000 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4793,13 +4793,10 @@ again: */ static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) { - void *entry = NULL; - if (mas->last >= limit) return NULL; - entry = mas_next_slot(mas, limit, false); - return entry; + return mas_next_slot(mas, limit, false); } /* @@ -5880,18 +5877,8 @@ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) } EXPORT_SYMBOL_GPL(mas_expected_entries); -/** - * mas_next() - Get the next entry. - * @mas: The maple state - * @max: The maximum index to check. - * - * Returns the next entry after @mas->index. - * Must hold rcu_read_lock or the write lock. - * Can return the zero entry. - * - * Return: The next entry or %NULL - */ -void *mas_next(struct ma_state *mas, unsigned long max) +static inline bool mas_next_setup(struct ma_state *mas, unsigned long max, + void **entry) { bool was_none = mas_is_none(mas); @@ -5899,24 +5886,71 @@ void *mas_next(struct ma_state *mas, unsigned long max) mas->node = MAS_START; if (mas_is_start(mas)) - mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ + *entry = mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ if (mas_is_ptr(mas)) { + *entry = NULL; if (was_none && mas->index == 0) { mas->index = mas->last = 0; - return mas_root(mas); + return true; } mas->index = 1; mas->last = ULONG_MAX; mas->node = MAS_NONE; - return NULL; + return true; } - /* Retries on dead nodes handled by mas_next_entry */ - return mas_next_entry(mas, max); + if (mas_is_none(mas)) + return true; + return false; +} + +/** + * mas_next() - Get the next entry. + * @mas: The maple state + * @max: The maximum index to check. + * + * Returns the next entry after @mas->index. + * Must hold rcu_read_lock or the write lock. + * Can return the zero entry. 
+ * + * Return: The next entry or %NULL + */ +void *mas_next(struct ma_state *mas, unsigned long max) +{ + void *entry = NULL; + + if (mas_next_setup(mas, max, &entry)) + return entry; + + /* Retries on dead nodes handled by mas_next_slot */ + return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); +/** + * mas_next_range() - Advance the maple state to the next range + * @mas: The maple state + * @max: The maximum index to check. + * + * Sets @mas->index and @mas->last to the range. + * Must hold rcu_read_lock or the write lock. + * Can return the zero entry. + * + * Return: The next entry or %NULL + */ +void *mas_next_range(struct ma_state *mas, unsigned long max) +{ + void *entry = NULL; + + if (mas_next_setup(mas, max, &entry)) + return entry; + + /* Retries on dead nodes handled by mas_next_slot */ + return mas_next_slot(mas, max, true); +} +EXPORT_SYMBOL_GPL(mas_next_range); + /** * mt_next() - get the next value in the maple tree * @mt: The maple tree @@ -6026,49 +6060,41 @@ void mas_pause(struct ma_state *mas) EXPORT_SYMBOL_GPL(mas_pause); /** - * mas_find() - On the first call, find the entry at or after mas->index up to - * %max. Otherwise, find the entry after mas->index. + * mas_find_setup() - Internal function to set up mas_find*(). * @mas: The maple state - * @max: The maximum value to check. - * - * Must hold rcu_read_lock or the write lock. - * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * @max: The maximum index + * @entry: Pointer to the entry * - * Return: The entry or %NULL. + * Returns: True if entry is the answer, false otherwise. 
*/ -void *mas_find(struct ma_state *mas, unsigned long max) +static inline bool mas_find_setup(struct ma_state *mas, unsigned long max, + void **entry) { + *entry = NULL; + if (unlikely(mas_is_none(mas))) { if (unlikely(mas->last >= max)) - return NULL; + return true; mas->index = mas->last; mas->node = MAS_START; - } - - if (unlikely(mas_is_paused(mas))) { + } else if (unlikely(mas_is_paused(mas))) { if (unlikely(mas->last >= max)) - return NULL; + return true; mas->node = MAS_START; mas->index = ++mas->last; - } - - - if (unlikely(mas_is_ptr(mas))) + } else if (unlikely(mas_is_ptr(mas))) goto ptr_out_of_range; if (unlikely(mas_is_start(mas))) { /* First run or continue */ - void *entry; - if (mas->index > max) - return NULL; + return true; - entry = mas_walk(mas); - if (entry) - return entry; + *entry = mas_walk(mas); + if (*entry) + return true; } @@ -6076,23 +6102,69 @@ void *mas_find(struct ma_state *mas, unsigned long max) if (unlikely(mas_is_ptr(mas))) goto ptr_out_of_range; - return NULL; + return true; } if (mas->index == max) - return NULL; + return true; - /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, false); + return false; ptr_out_of_range: mas->node = MAS_NONE; mas->index = 1; mas->last = ULONG_MAX; - return NULL; + return true; +} + +/** + * mas_find() - On the first call, find the entry at or after mas->index up to + * %max. Otherwise, find the entry after mas->index. + * @mas: The maple state + * @max: The maximum value to check. + * + * Must hold rcu_read_lock or the write lock. + * If an entry exists, last and index are updated accordingly. + * May set @mas->node to MAS_NONE. + * + * Return: The entry or %NULL. 
+ */ +void *mas_find(struct ma_state *mas, unsigned long max) +{ + void *entry = NULL; + + if (mas_find_setup(mas, max, &entry)) + return entry; + + /* Retries on dead nodes handled by mas_next_slot */ + return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_find); +/** + * mas_find_range() - On the first call, find the entry at or after + * mas->index up to %max. Otherwise, advance to the next slot mas->index. + * @mas: The maple state + * @max: The maximum value to check. + * + * Must hold rcu_read_lock or the write lock. + * If an entry exists, last and index are updated accordingly. + * May set @mas->node to MAS_NONE. + * + * Return: The entry or %NULL. + */ +void *mas_find_range(struct ma_state *mas, unsigned long max) +{ + void *entry; + + if (mas_find_setup(mas, max, &entry)) + return entry; + + /* Retries on dead nodes handled by mas_next_slot */ + return mas_next_slot(mas, max, true); +} +EXPORT_SYMBOL_GPL(mas_find_range); + /** * mas_find_rev: On the first call, find the first non-null entry at or below * mas->index down to %min. Otherwise find the first non-null entry below -- cgit v1.2.3 From 6b9e93e0102048e64681c2fa265ae81c221f6c6d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 18 May 2023 10:55:40 -0400 Subject: maple_tree: add mas_prev_range() and mas_find_range_rev interface Some users of the maple tree may want to move to the previous range regardless of the value stored there. Add this interface as well as the 'find' variant to support walking to the first value, then iterating over the previous ranges. Link: https://lkml.kernel.org/r/20230518145544.1722059-32-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Vernon Yang Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 + lib/maple_tree.c | 161 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 124 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 9d040043858a..541675229568 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -457,6 +457,7 @@ void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); +void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, gfp_t gfp); bool mas_is_err(struct ma_state *mas); @@ -467,6 +468,7 @@ void mas_destroy(struct ma_state *mas); int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); +void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); void *mas_next_range(struct ma_state *mas, unsigned long max); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9c7e765c809d..59c15f8b4793 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5919,18 +5919,8 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) } EXPORT_SYMBOL_GPL(mt_next); -/** - * mas_prev() - Get the previous entry - * @mas: The maple state - * @min: The minimum value to check. - * - * Must hold rcu_read_lock or the write lock. - * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not - * searchable nodes. - * - * Return: the previous value or %NULL. 
- */ -void *mas_prev(struct ma_state *mas, unsigned long min) +static inline bool mas_prev_setup(struct ma_state *mas, unsigned long min, + void **entry) { if (mas->index <= min) goto none; @@ -5948,7 +5938,8 @@ void *mas_prev(struct ma_state *mas, unsigned long min) if (!mas->index) goto none; mas->index = mas->last = 0; - return mas_root(mas); + *entry = mas_root(mas); + return true; } if (mas_is_none(mas)) { @@ -5956,18 +5947,64 @@ void *mas_prev(struct ma_state *mas, unsigned long min) /* Walked to out-of-range pointer? */ mas->index = mas->last = 0; mas->node = MAS_ROOT; - return mas_root(mas); + *entry = mas_root(mas); + return true; } - return NULL; + return true; } - return mas_prev_slot(mas, min, false); + + return false; none: mas->node = MAS_NONE; - return NULL; + return true; +} + +/** + * mas_prev() - Get the previous entry + * @mas: The maple state + * @min: The minimum value to check. + * + * Must hold rcu_read_lock or the write lock. + * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not + * searchable nodes. + * + * Return: the previous value or %NULL. + */ +void *mas_prev(struct ma_state *mas, unsigned long min) +{ + void *entry = NULL; + + if (mas_prev_setup(mas, min, &entry)) + return entry; + + return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); +/** + * mas_prev_range() - Advance to the previous range + * @mas: The maple state + * @min: The minimum value to check. + * + * Sets @mas->index and @mas->last to the range. + * Must hold rcu_read_lock or the write lock. + * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not + * searchable nodes. + * + * Return: the previous value or %NULL. 
+ */ +void *mas_prev_range(struct ma_state *mas, unsigned long min) +{ + void *entry = NULL; + + if (mas_prev_setup(mas, min, &entry)) + return entry; + + return mas_prev_slot(mas, min, true); +} +EXPORT_SYMBOL_GPL(mas_prev_range); + /** * mt_prev() - get the previous value in the maple tree * @mt: The maple tree @@ -6114,20 +6151,18 @@ void *mas_find_range(struct ma_state *mas, unsigned long max) EXPORT_SYMBOL_GPL(mas_find_range); /** - * mas_find_rev: On the first call, find the first non-null entry at or below - * mas->index down to %min. Otherwise find the first non-null entry below - * mas->index down to %min. + * mas_find_rev_setup() - Internal function to set up mas_find_*_rev() * @mas: The maple state - * @min: The minimum value to check. - * - * Must hold rcu_read_lock or the write lock. - * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * @min: The minimum index + * @entry: Pointer to the entry * - * Return: The entry or %NULL. + * Returns: True if entry is the answer, false otherwise. 
*/ -void *mas_find_rev(struct ma_state *mas, unsigned long min) +static inline bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, + void **entry) { + *entry = NULL; + if (unlikely(mas_is_none(mas))) { if (mas->index <= min) goto none; @@ -6139,7 +6174,7 @@ void *mas_find_rev(struct ma_state *mas, unsigned long min) if (unlikely(mas_is_paused(mas))) { if (unlikely(mas->index <= min)) { mas->node = MAS_NONE; - return NULL; + return true; } mas->node = MAS_START; mas->last = --mas->index; @@ -6147,14 +6182,12 @@ void *mas_find_rev(struct ma_state *mas, unsigned long min) if (unlikely(mas_is_start(mas))) { /* First run or continue */ - void *entry; - if (mas->index < min) - return NULL; + return true; - entry = mas_walk(mas); - if (entry) - return entry; + *entry = mas_walk(mas); + if (*entry) + return true; } if (unlikely(!mas_searchable(mas))) { @@ -6168,22 +6201,72 @@ void *mas_find_rev(struct ma_state *mas, unsigned long min) */ mas->last = mas->index = 0; mas->node = MAS_ROOT; - return mas_root(mas); + *entry = mas_root(mas); + return true; } } if (mas->index < min) - return NULL; + return true; - /* Retries on dead nodes handled by mas_prev_slot */ - return mas_prev_slot(mas, min, false); + return false; none: mas->node = MAS_NONE; - return NULL; + return true; +} + +/** + * mas_find_rev: On the first call, find the first non-null entry at or below + * mas->index down to %min. Otherwise find the first non-null entry below + * mas->index down to %min. + * @mas: The maple state + * @min: The minimum value to check. + * + * Must hold rcu_read_lock or the write lock. + * If an entry exists, last and index are updated accordingly. + * May set @mas->node to MAS_NONE. + * + * Return: The entry or %NULL. 
+ */ +void *mas_find_rev(struct ma_state *mas, unsigned long min) +{ + void *entry; + + if (mas_find_rev_setup(mas, min, &entry)) + return entry; + + /* Retries on dead nodes handled by mas_prev_slot */ + return mas_prev_slot(mas, min, false); + } EXPORT_SYMBOL_GPL(mas_find_rev); +/** + * mas_find_range_rev: On the first call, find the first non-null entry at or + * below mas->index down to %min. Otherwise advance to the previous slot after + * mas->index down to %min. + * @mas: The maple state + * @min: The minimum value to check. + * + * Must hold rcu_read_lock or the write lock. + * If an entry exists, last and index are updated accordingly. + * May set @mas->node to MAS_NONE. + * + * Return: The entry or %NULL. + */ +void *mas_find_range_rev(struct ma_state *mas, unsigned long min) +{ + void *entry; + + if (mas_find_rev_setup(mas, min, &entry)) + return entry; + + /* Retries on dead nodes handled by mas_prev_slot */ + return mas_prev_slot(mas, min, true); +} +EXPORT_SYMBOL_GPL(mas_find_range_rev); + /** * mas_erase() - Find the range in which index resides and erase the entire * range. -- cgit v1.2.3 From bb5dbd2272b8d7b3a34d234bb916819afbf802d1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 18 May 2023 10:55:43 -0400 Subject: mm: add vma_iter_{next,prev}_range() to vma iterator Add functionality to the VMA iterator to advance and retreat one offset within the maple tree, regardless of the value contained. This can lead to less re-walking to find an area of interest, especially when there is nothing in that offset. Link: https://lkml.kernel.org/r/20230518145544.1722059-35-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: David Binderman Cc: Peng Zhang Cc: Sergey Senozhatsky Cc: Vernon Yang Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 280429ffa91d..62bb3272e531 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -866,11 +866,24 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) return mas_find(&vmi->mas, ULONG_MAX); } +static inline +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) +{ + return mas_next_range(&vmi->mas, ULONG_MAX); +} + + static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { return mas_prev(&vmi->mas, 0); } +static inline +struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) +{ + return mas_prev_range(&vmi->mas, 0); +} + static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; -- cgit v1.2.3 From ecd8b2928f2efc7b678b361d51920c15b5ef3885 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 19 May 2023 14:39:55 +0200 Subject: mm: compaction: remove compaction result helpers Patch series "mm: compaction: cleanups & simplifications". These compaction cleanups are split out from the huge page allocator series[1], as requested by reviewer feedback. [1] https://lore.kernel.org/linux-mm/20230418191313.268131-1-hannes@cmpxchg.org/ This patch (of 5): The compaction result helpers encode quirks that are specific to the allocator's retry logic. E.g. COMPACT_SUCCESS and COMPACT_COMPLETE actually represent failures that should be retried upon, and so on. I frequently found myself pulling up the helper implementation in order to understand and work on the retry logic. They're not quite clean abstractions; rather they split the retry logic into two locations. Remove the helpers and inline the checks. Then comment on the result interpretations directly where the decision making happens. 
Link: https://lkml.kernel.org/r/20230519123959.77335-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20230519123959.77335-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/compaction.h | 92 ------------------------------------------ include/trace/events/mmflags.h | 4 +- mm/page_alloc.c | 30 ++++++++------ 3 files changed, 19 insertions(+), 107 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a6e512cfb670..1f0328a2ba48 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -95,78 +95,6 @@ extern enum compact_result compaction_suitable(struct zone *zone, int order, extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); -/* Compaction has made some progress and retrying makes sense */ -static inline bool compaction_made_progress(enum compact_result result) -{ - /* - * Even though this might sound confusing this in fact tells us - * that the compaction successfully isolated and migrated some - * pageblocks. - */ - if (result == COMPACT_SUCCESS) - return true; - - return false; -} - -/* Compaction has failed and it doesn't make much sense to keep retrying. */ -static inline bool compaction_failed(enum compact_result result) -{ - /* All zones were scanned completely and still not result. */ - if (result == COMPACT_COMPLETE) - return true; - - return false; -} - -/* Compaction needs reclaim to be performed first, so it can continue. */ -static inline bool compaction_needs_reclaim(enum compact_result result) -{ - /* - * Compaction backed off due to watermark checks for order-0 - * so the regular reclaim has to try harder and reclaim something. - */ - if (result == COMPACT_SKIPPED) - return true; - - return false; -} - -/* - * Compaction has backed off for some reason after doing some work or none - * at all. It might be throttling or lock contention. 
Retrying might be still - * worthwhile, but with a higher priority if allowed. - */ -static inline bool compaction_withdrawn(enum compact_result result) -{ - /* - * If compaction is deferred for high-order allocations, it is - * because sync compaction recently failed. If this is the case - * and the caller requested a THP allocation, we do not want - * to heavily disrupt the system, so we fail the allocation - * instead of entering direct reclaim. - */ - if (result == COMPACT_DEFERRED) - return true; - - /* - * If compaction in async mode encounters contention or blocks higher - * priority task we back off early rather than cause stalls. - */ - if (result == COMPACT_CONTENDED) - return true; - - /* - * Page scanners have met but we haven't scanned full zones so this - * is a back off in fact. - */ - if (result == COMPACT_PARTIAL_SKIPPED) - return true; - - return false; -} - - bool compaction_zonelist_suitable(struct alloc_context *ac, int order, int alloc_flags); @@ -185,26 +113,6 @@ static inline enum compact_result compaction_suitable(struct zone *zone, int ord return COMPACT_SKIPPED; } -static inline bool compaction_made_progress(enum compact_result result) -{ - return false; -} - -static inline bool compaction_failed(enum compact_result result) -{ - return false; -} - -static inline bool compaction_needs_reclaim(enum compact_result result) -{ - return false; -} - -static inline bool compaction_withdrawn(enum compact_result result) -{ - return true; -} - static inline void kcompactd_run(int nid) { } diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index b63e7c0fbbe5..1478b9dd05fa 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -223,8 +223,8 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ #define compact_result_to_feedback(result) \ ({ \ enum compact_result __result = result; \ - (compaction_failed(__result)) ? COMPACTION_FAILED : \ - (compaction_withdrawn(__result)) ? 
COMPACTION_WITHDRAWN : COMPACTION_PROGRESS; \ + (__result == COMPACT_COMPLETE) ? COMPACTION_FAILED : \ + (__result == COMPACT_SUCCESS) ? COMPACTION_PROGRESS : COMPACTION_WITHDRAWN; \ }) #define COMPACTION_FEEDBACK \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b9a9ba2db9e9..e3a3ebc2dfce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3469,35 +3469,39 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, if (fatal_signal_pending(current)) return false; - if (compaction_made_progress(compact_result)) + /* + * Compaction managed to coalesce some page blocks, but the + * allocation failed presumably due to a race. Retry some. + */ + if (compact_result == COMPACT_SUCCESS) (*compaction_retries)++; /* - * compaction considers all the zone as desperately out of memory - * so it doesn't really make much sense to retry except when the + * All zones were scanned completely and still no result. It + * doesn't really make much sense to retry except when the * failure could be caused by insufficient priority */ - if (compaction_failed(compact_result)) + if (compact_result == COMPACT_COMPLETE) goto check_priority; /* - * compaction was skipped because there are not enough order-0 pages - * to work with, so we retry only if it looks like reclaim can help. + * Compaction was skipped due to a lack of free order-0 + * migration targets. Continue if reclaim can help. */ - if (compaction_needs_reclaim(compact_result)) { + if (compact_result == COMPACT_SKIPPED) { ret = compaction_zonelist_suitable(ac, order, alloc_flags); goto out; } /* - * make sure the compaction wasn't deferred or didn't bail out early - * due to locks contention before we declare that we should give up. - * But the next retry should use a higher priority if allowed, so - * we don't just keep bailing out endlessly. 
+ * If compaction backed off due to being deferred, due to + * contended locks in async mode, or due to scanners meeting + * after a partial scan, retry with increased priority. */ - if (compaction_withdrawn(compact_result)) { + if (compact_result == COMPACT_DEFERRED || + compact_result == COMPACT_CONTENDED || + compact_result == COMPACT_PARTIAL_SKIPPED) goto check_priority; - } /* * !costly requests are much more important than __GFP_RETRY_MAYFAIL -- cgit v1.2.3 From e8606320e9af9774fd879e71c940fc9e5fd9b901 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 19 May 2023 14:39:57 +0200 Subject: mm: compaction: refactor __compaction_suitable() __compaction_suitable() is supposed to check for available migration targets. However, it also checks whether the operation was requested via /proc/sys/vm/compact_memory, and whether the original allocation request can already succeed. These don't apply to all callsites. Move the checks out to the callers, so that later patches can deal with them one by one. No functional change intended.
[hannes@cmpxchg.org: fix comment, per Vlastimil] Link: https://lkml.kernel.org/r/20230602144942.GC161817@cmpxchg.org Link: https://lkml.kernel.org/r/20230519123959.77335-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/compaction.h | 4 +-- mm/compaction.c | 79 +++++++++++++++++++++++++++++----------------- mm/vmscan.c | 35 ++++++++++++-------- 3 files changed, 73 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 1f0328a2ba48..9f7cf3e1bf89 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -90,7 +90,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int highest_zoneidx); + int highest_zoneidx); extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); @@ -108,7 +108,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) } static inline enum compact_result compaction_suitable(struct zone *zone, int order, - int alloc_flags, int highest_zoneidx) + int highest_zoneidx) { return COMPACT_SKIPPED; } diff --git a/mm/compaction.c b/mm/compaction.c index e23e00bec030..bb9b76244a5d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2194,24 +2194,10 @@ static enum compact_result compact_finished(struct compact_control *cc) } static enum compact_result __compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int highest_zoneidx, unsigned long wmark_target) { unsigned long watermark; - - if (is_via_compact_memory(order)) - return COMPACT_CONTINUE; - - watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); - /* - * If watermarks for high-order allocation are already met, there - * should be no need for 
compaction at all. - */ - if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, - alloc_flags)) - return COMPACT_SUCCESS; - /* * Watermarks for order-0 must be met for compaction to be able to * isolate free pages for migration targets. This means that the @@ -2240,17 +2226,15 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * compaction_suitable: Is this suitable to run compaction on this zone now? * Returns * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_SUCCESS - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int highest_zoneidx) { enum compact_result ret; int fragindex; - ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx, + ret = __compaction_suitable(zone, order, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to @@ -2294,7 +2278,16 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->highest_zoneidx, ac->nodemask) { unsigned long available; - enum compact_result compact_result; + unsigned long watermark; + + if (is_via_compact_memory(order)) + return true; + + /* Allocation can already succeed, nothing to do */ + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); + if (zone_watermark_ok(zone, order, watermark, + ac->highest_zoneidx, alloc_flags)) + continue; /* * Do not consider all the reclaimable memory because we do not @@ -2304,9 +2297,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, */ available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); - compact_result = __compaction_suitable(zone, order, alloc_flags, - ac->highest_zoneidx, available); - if (compact_result == 
COMPACT_CONTINUE) + if (__compaction_suitable(zone, order, ac->highest_zoneidx, + available) == COMPACT_CONTINUE) return true; } @@ -2336,11 +2328,23 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) INIT_LIST_HEAD(&cc->migratepages); cc->migratetype = gfp_migratetype(cc->gfp_mask); - ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, - cc->highest_zoneidx); - /* Compaction is likely to fail */ - if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) - return ret; + + if (!is_via_compact_memory(cc->order)) { + unsigned long watermark; + + /* Allocation can already succeed, nothing to do */ + watermark = wmark_pages(cc->zone, + cc->alloc_flags & ALLOC_WMARK_MASK); + if (zone_watermark_ok(cc->zone, cc->order, watermark, + cc->highest_zoneidx, cc->alloc_flags)) + return COMPACT_SUCCESS; + + ret = compaction_suitable(cc->zone, cc->order, + cc->highest_zoneidx); + /* Compaction is likely to fail */ + if (ret == COMPACT_SKIPPED) + return ret; + } /* * Clear pageblock skip if there were failures recently and compaction @@ -2844,7 +2848,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) if (!populated_zone(zone)) continue; - if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + if (is_via_compact_memory(pgdat->kcompactd_max_order)) + return true; + + /* Allocation can already succeed, check other zones */ + if (zone_watermark_ok(zone, pgdat->kcompactd_max_order, + min_wmark_pages(zone), + highest_zoneidx, 0)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, highest_zoneidx) == COMPACT_CONTINUE) return true; } @@ -2882,10 +2895,18 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (compaction_deferred(zone, cc.order)) continue; - if (compaction_suitable(zone, cc.order, 0, zoneid) != - COMPACT_CONTINUE) + if (is_via_compact_memory(cc.order)) + goto compact; + + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, cc.order, + min_wmark_pages(zone), zoneid, 0)) 
continue; + if (compaction_suitable(zone, cc.order, + zoneid) != COMPACT_CONTINUE) + continue; +compact: if (kthread_should_stop()) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index 4637f6462e9c..9f8bfd1fcf58 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6399,14 +6399,17 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (!managed_zone(zone)) continue; - switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { - case COMPACT_SUCCESS: - case COMPACT_CONTINUE: + if (sc->order == -1) /* is_via_compact_memory() */ + return false; + + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) + return false; + + if (compaction_suitable(zone, sc->order, + sc->reclaim_idx) == COMPACT_CONTINUE) return false; - default: - /* check next zone */ - ; - } } /* @@ -6594,16 +6597,20 @@ again: static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; - enum compact_result suitable; - suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); - if (suitable == COMPACT_SUCCESS) - /* Allocation should succeed already. Don't reclaim. */ + if (sc->order == -1) /* is_via_compact_memory() */ + goto suitable; + + /* Allocation can already succeed, nothing to do */ + if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), + sc->reclaim_idx, 0)) return true; - if (suitable == COMPACT_SKIPPED) - /* Compaction cannot yet proceed. Do reclaim. */ - return false; + /* Compaction cannot yet proceed. Do reclaim. */ + if (compaction_suitable(zone, sc->order, + sc->reclaim_idx) == COMPACT_SKIPPED) + return false; +suitable: /* * Compaction is already possible, but it takes time to run and there * are potentially other callers using the pages just freed. 
So proceed -- cgit v1.2.3 From 3cf04937529020e149666f56a41ebdeb226b69ed Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 2 Jun 2023 11:12:04 -0400 Subject: mm: compaction: have compaction_suitable() return bool Since it only returns COMPACT_CONTINUE or COMPACT_SKIPPED now, a bool return value simplifies the callsites. Link: https://lkml.kernel.org/r/20230602151204.GD161817@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Baolin Wang Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/compaction.h | 6 ++--- mm/compaction.c | 64 ++++++++++++++++++++++------------------------ mm/vmscan.c | 6 ++--- 3 files changed, 36 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 9f7cf3e1bf89..57b16e69c19a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -89,7 +89,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, const struct alloc_context *ac, enum compact_priority prio, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); -extern enum compact_result compaction_suitable(struct zone *zone, int order, +extern bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx); extern void compaction_defer_reset(struct zone *zone, int order, @@ -107,10 +107,10 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) { } -static inline enum compact_result compaction_suitable(struct zone *zone, int order, +static inline bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) { - return COMPACT_SKIPPED; + return false; } static inline void kcompactd_run(int nid) diff --git a/mm/compaction.c b/mm/compaction.c index 470cfd24ef18..9b550bfe900b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2193,9 +2193,9 @@ static enum compact_result compact_finished(struct compact_control *cc) return ret; } -static enum 
compact_result __compaction_suitable(struct zone *zone, int order, - int highest_zoneidx, - unsigned long wmark_target) +static bool __compaction_suitable(struct zone *zone, int order, + int highest_zoneidx, + unsigned long wmark_target) { unsigned long watermark; /* @@ -2215,27 +2215,20 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); - if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx, - ALLOC_CMA, wmark_target)) - return COMPACT_SKIPPED; - - return COMPACT_CONTINUE; + return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx, + ALLOC_CMA, wmark_target); } /* * compaction_suitable: Is this suitable to run compaction on this zone now? - * Returns - * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_CONTINUE - If compaction should run now */ -enum compact_result compaction_suitable(struct zone *zone, int order, - int highest_zoneidx) +bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx) { - enum compact_result ret; - int fragindex; + enum compact_result compact_result; + bool suitable; - ret = __compaction_suitable(zone, order, highest_zoneidx, - zone_page_state(zone, NR_FREE_PAGES)); + suitable = __compaction_suitable(zone, order, highest_zoneidx, + zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation @@ -2252,17 +2245,24 @@ enum compact_result compaction_suitable(struct zone *zone, int order, * excessive compaction for costly orders, but it should not be at the * expense of system stability. 
*/ - if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) { - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - ret = COMPACT_NOT_SUITABLE_ZONE; + if (suitable) { + compact_result = COMPACT_CONTINUE; + if (order > PAGE_ALLOC_COSTLY_ORDER) { + int fragindex = fragmentation_index(zone, order); + + if (fragindex >= 0 && + fragindex <= sysctl_extfrag_threshold) { + suitable = false; + compact_result = COMPACT_NOT_SUITABLE_ZONE; + } + } + } else { + compact_result = COMPACT_SKIPPED; } - trace_mm_compaction_suitable(zone, order, ret); - if (ret == COMPACT_NOT_SUITABLE_ZONE) - ret = COMPACT_SKIPPED; + trace_mm_compaction_suitable(zone, order, compact_result); - return ret; + return suitable; } bool compaction_zonelist_suitable(struct alloc_context *ac, int order, @@ -2288,7 +2288,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); if (__compaction_suitable(zone, order, ac->highest_zoneidx, - available) == COMPACT_CONTINUE) + available)) return true; } @@ -2329,11 +2329,10 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->highest_zoneidx, cc->alloc_flags)) return COMPACT_SUCCESS; - ret = compaction_suitable(cc->zone, cc->order, - cc->highest_zoneidx); /* Compaction is likely to fail */ - if (ret == COMPACT_SKIPPED) - return ret; + if (!compaction_suitable(cc->zone, cc->order, + cc->highest_zoneidx)) + return COMPACT_SKIPPED; } /* @@ -2845,7 +2844,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) continue; if (compaction_suitable(zone, pgdat->kcompactd_max_order, - highest_zoneidx) == COMPACT_CONTINUE) + highest_zoneidx)) return true; } @@ -2887,8 +2886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) min_wmark_pages(zone), zoneid, 0)) continue; - if (compaction_suitable(zone, cc.order, - zoneid) != COMPACT_CONTINUE) + if 
(!compaction_suitable(zone, cc.order, zoneid)) continue; if (kthread_should_stop()) diff --git a/mm/vmscan.c b/mm/vmscan.c index 99e4ae44850d..df7e52b522ec 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6404,8 +6404,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, sc->reclaim_idx, 0)) return false; - if (compaction_suitable(zone, sc->order, - sc->reclaim_idx) == COMPACT_CONTINUE) + if (compaction_suitable(zone, sc->order, sc->reclaim_idx)) return false; } @@ -6601,8 +6600,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) return true; /* Compaction cannot yet proceed. Do reclaim. */ - if (compaction_suitable(zone, sc->order, - sc->reclaim_idx) == COMPACT_SKIPPED) + if (!compaction_suitable(zone, sc->order, sc->reclaim_idx)) return false; /* -- cgit v1.2.3 From 5c7e7a0d79072eb02780a2c0dee730b23cde711d Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Mon, 22 May 2023 11:20:56 +0000 Subject: mm: multi-gen LRU: cleanup lru_gen_soft_reclaim() lru_gen_soft_reclaim() gets the lruvec from the memcg and node ID to keep a cleaner interface on the caller side. Link: https://lkml.kernel.org/r/20230522112058.2965866-2-talumbau@google.com Signed-off-by: T.J. 
Alumbaugh Reviewed-by: Yuanchu Xie Cc: David Hildenbrand Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ++-- mm/memcontrol.c | 2 +- mm/vmscan.c | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3a68326c9989..5a7ada0413da 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -534,7 +534,7 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); -void lru_gen_soft_reclaim(struct lruvec *lruvec); +void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); #else /* !CONFIG_MEMCG */ @@ -585,7 +585,7 @@ static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } -static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) +static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7c681492b47b..6a3d4ce87b8a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -485,7 +485,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) if (lru_gen_enabled()) { if (soft_limit_excess(memcg)) - lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); + lru_gen_soft_reclaim(memcg, nid); return; } diff --git a/mm/vmscan.c b/mm/vmscan.c index cafb933d609f..a51a7e0f8b63 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4846,8 +4846,10 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg) } } -void lru_gen_soft_reclaim(struct lruvec *lruvec) +void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + /* see the comment on MEMCG_NR_GENS */ if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); -- cgit v1.2.3 From 447ba88658faa8dbfd29d557daa38b7d88f460ec Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: 
Thu, 25 May 2023 20:54:00 +0800 Subject: mm: compaction: add trace event for fast freepages isolation The fast_isolate_freepages() can also isolate freepages, but we can not know the fast isolation efficiency to understand the fast isolation pressure. So add a trace event to show some numbers to help to understand the efficiency for fast freepages isolation. Link: https://lkml.kernel.org/r/78d2932d0160d122c15372aceb3f2c45460a17fc.1685018752.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/trace/events/compaction.h | 11 +++++++++++ mm/compaction.c | 6 +++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 3313eb83c117..2b2a975efd20 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -64,6 +64,17 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_fast_isolate_freepages, + + TP_PROTO( + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) +); + #ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, diff --git a/mm/compaction.c b/mm/compaction.c index 3b09d8d02581..ce6293bf9c4a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1416,7 +1416,7 @@ static int next_search_order(struct compact_control *cc, int order) static void fast_isolate_freepages(struct compact_control *cc) { unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1); - unsigned int nr_scanned = 0; + unsigned int nr_scanned = 0, total_isolated = 0; unsigned long low_pfn, min_pfn, highest = 0; unsigned long nr_isolated = 0; unsigned long distance; @@ -1515,6 +1515,7 @@ static void 
fast_isolate_freepages(struct compact_control *cc) set_page_private(page, order); nr_isolated = 1 << order; nr_scanned += nr_isolated - 1; + total_isolated += nr_isolated; cc->nr_freepages += nr_isolated; list_add_tail(&page->lru, &cc->freepages); count_compact_events(COMPACTISOLATED, nr_isolated); @@ -1535,6 +1536,9 @@ static void fast_isolate_freepages(struct compact_control *cc) limit = max(1U, limit >> 1); } + trace_mm_compaction_fast_isolate_freepages(min_pfn, cc->free_pfn, + nr_scanned, total_isolated); + if (!page) { cc->fast_search_fail++; if (scan_start) { -- cgit v1.2.3 From 06b27ce36a1a3dc5ea6f8314d0c7d1baa9f8ece7 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 24 May 2023 11:12:47 +0800 Subject: maple_tree: relocate the declaration of mas_empty_area_rev(). Relocate the declaration of mas_empty_area_rev() so that mas_empty_area() and mas_empty_area_rev() are together. Link: https://lkml.kernel.org/r/20230524031247.65949-11-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 541675229568..295548cca8b3 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -474,6 +474,12 @@ void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); +/* + * This finds an empty area from the highest address to the lowest. 
+ * AKA "Topdown" version, + */ +int mas_empty_area_rev(struct ma_state *mas, unsigned long min, + unsigned long max, unsigned long size); static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, unsigned long addr) @@ -497,12 +503,6 @@ static inline bool mas_is_paused(const struct ma_state *mas) return mas->node == MAS_PAUSE; } -/* - * This finds an empty area from the highest address to the lowest. - * AKA "Topdown" version, - */ -int mas_empty_area_rev(struct ma_state *mas, unsigned long min, - unsigned long max, unsigned long size); /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. -- cgit v1.2.3 From 3ecdeb0f876e91c4a7129ba2ba5baa530aa6c4f9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 29 May 2023 14:13:53 +0800 Subject: swap: remove __swp_swapcount() __swp_swapcount() just encloses the calling to swap_swapcount() with get/put_swap_device(). It is called in __read_swap_cache_async() only, which encloses the calling with get/put_swap_device() already. So, __read_swap_cache_async() can call swap_swapcount() directly. 
Link: https://lkml.kernel.org/r/20230529061355.125791-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: David Hildenbrand Reviewed-by: Chris Li (Google) Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Yang Shi Cc: Yu Zhao Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/swap_state.c | 2 +- mm/swapfile.c | 20 +------------------- 3 files changed, 4 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index b2128df5edea..2ddbfd85f6c7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -513,7 +513,7 @@ int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); -extern int __swp_swapcount(swp_entry_t entry); +extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); @@ -591,7 +591,7 @@ static inline int __swap_count(swp_entry_t entry) return 0; } -static inline int __swp_swapcount(swp_entry_t entry) +static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { return 0; } diff --git a/mm/swap_state.c b/mm/swap_state.c index a8450b4a110c..ef32353c18a6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -447,7 +447,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * as SWAP_HAS_CACHE. That's done in later part of code or * else swap_off will be aborted if we return NULL. 
*/ - if (!__swp_swapcount(entry) && swap_slot_cache_enabled) + if (!swap_swapcount(si, entry) && swap_slot_cache_enabled) goto fail_put_swap; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index cf8b16b6a98e..2d264efe90d2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1444,7 +1444,7 @@ int __swap_count(swp_entry_t entry) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; @@ -1456,24 +1456,6 @@ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) return count; } -/* - * How many references to @entry are currently swapped out? - * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. - */ -int __swp_swapcount(swp_entry_t entry) -{ - int count = 0; - struct swap_info_struct *si; - - si = get_swap_device(entry); - if (si) { - count = swap_swapcount(si, entry); - put_swap_device(si); - } - return count; -} - /* * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. -- cgit v1.2.3 From 0d625446d0a451a683a357799912b9e688629707 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:53 +0200 Subject: backing_dev: remove current->backing_dev_info Patch series "cleanup the filemap / direct I/O interaction", v4. This series cleans up some of the generic write helper calling conventions and the page cache writeback / invalidation for direct I/O. This is a spinoff from the no-bufferhead kernel project, for which we'll want to use an iomap based buffered write path in the block layer.
This patch (of 12): The last user of current->backing_dev_info disappeared in commit b9b1335e6403 ("remove bdi_congested() and wb_congested() and related functions"). Remove the field and all assignments to it. Link: https://lkml.kernel.org/r/20230601145904.1385409-1-hch@lst.de Link: https://lkml.kernel.org/r/20230601145904.1385409-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Darrick J. Wong Acked-by: Theodore Ts'o Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- fs/btrfs/file.c | 6 +----- fs/ceph/file.c | 4 ---- fs/ext4/file.c | 2 -- fs/f2fs/file.c | 2 -- fs/fuse/file.c | 4 ---- fs/gfs2/file.c | 2 -- fs/nfs/file.c | 5 +---- fs/ntfs/file.c | 2 -- fs/ntfs3/file.c | 3 --- fs/xfs/xfs_file.c | 4 ---- include/linux/sched.h | 3 --- mm/filemap.c | 3 --- 12 files changed, 2 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f649647392e0..ecd43ab66fa6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1145,7 +1145,6 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return -EAGAIN; - current->backing_dev_info = inode_to_bdi(inode); ret = file_remove_privs(file); if (ret) return ret; @@ -1165,10 +1164,8 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, loff_t end_pos = round_up(pos + count, fs_info->sectorsize); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); - if (ret) { - current->backing_dev_info = NULL; + if (ret) return ret; - } } return 0; @@ -1689,7 +1686,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, if (sync) 
atomic_dec(&inode->sync_writers); - current->backing_dev_info = NULL; return num_written; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f4d8bf7dec88..c8ef72f723ba 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1791,9 +1791,6 @@ retry_snap: else ceph_start_io_write(inode); - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - if (iocb->ki_flags & IOCB_APPEND) { err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); if (err < 0) @@ -1940,7 +1937,6 @@ out: ceph_end_io_write(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); - current->backing_dev_info = NULL; return written ? written : err; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d101b3b0c7da..bc430270c23c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -285,9 +285,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, if (ret <= 0) goto out; - current->backing_dev_info = inode_to_bdi(inode); ret = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; out: inode_unlock(inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5ac53d2627d2..4f423d367a44 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4517,9 +4517,7 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; - current->backing_dev_info = inode_to_bdi(inode); ret = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; if (ret > 0) { iocb->ki_pos += ret; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 89d97f6188e0..97d435874b14 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1362,9 +1362,6 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) writethrough: inode_lock(inode); - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - err = generic_write_checks(iocb, from); if (err <= 0) goto out; @@ -1409,7 +1406,6 @@ writethrough: iocb->ki_pos += written; } out: - 
current->backing_dev_info = NULL; inode_unlock(inode); if (written > 0) written = generic_write_sync(iocb, written); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 300844f50dcd..904a0d6ac1a1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1041,11 +1041,9 @@ retry: goto out_unlock; } - current->backing_dev_info = inode_to_bdi(inode); pagefault_disable(); ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); pagefault_enable(); - current->backing_dev_info = NULL; if (ret > 0) { iocb->ki_pos += ret; written += ret; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f0edf5a36237..665ce3fc62ea 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -648,11 +648,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); result = generic_write_checks(iocb, from); - if (result > 0) { - current->backing_dev_info = inode_to_bdi(inode); + if (result > 0) result = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; - } nfs_end_io_write(inode); if (result <= 0) goto out; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c481b14e4fd9..e296f804a9c4 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1911,11 +1911,9 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(vi); /* We can write back this queue in page reclaim. 
*/ - current->backing_dev_info = inode_to_bdi(vi); err = ntfs_prepare_file_for_write(iocb, from); if (iov_iter_count(from) && !err) written = ntfs_perform_write(file, from, iocb->ki_pos); - current->backing_dev_info = NULL; inode_unlock(vi); iocb->ki_pos += written; if (likely(written > 0)) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 9a3d55c367d9..86d16a2c8339 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -820,7 +820,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) if (!pages) return -ENOMEM; - current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) goto out; @@ -993,8 +992,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) out: kfree(pages); - current->backing_dev_info = NULL; - if (err < 0) return err; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aede746541f8..431c3fd0e2b5 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -717,9 +717,6 @@ write_retry: if (ret) goto out; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, &xfs_buffered_write_iomap_ops); @@ -753,7 +750,6 @@ write_retry: goto write_retry; } - current->backing_dev_info = NULL; out: if (iolock) xfs_iunlock(ip, iolock); diff --git a/include/linux/sched.h b/include/linux/sched.h index eed5d65b8d1f..54780571fe9a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -41,7 +41,6 @@ /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; -struct backing_dev_info; struct bio_list; struct blk_plug; struct bpf_local_storage; @@ -1186,8 +1185,6 @@ struct task_struct { /* VM state: */ struct reclaim_state *reclaim_state; - struct backing_dev_info *backing_dev_info; - struct io_context *io_context; #ifdef CONFIG_COMPACTION diff --git a/mm/filemap.c b/mm/filemap.c index 570bc8c3db87..0d371ed91a68 
100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3964,8 +3964,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t err; ssize_t status; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) goto out; @@ -4026,7 +4024,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos += written; } out: - current->backing_dev_info = NULL; return written ? written : err; } EXPORT_SYMBOL(__generic_file_write_iter); -- cgit v1.2.3 From 3c435a0fe35c220bec442dffff04a64aacf952b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:56 +0200 Subject: filemap: add a kiocb_write_and_wait helper Factor out a helper that does filemap_write_and_wait_range for the range covered by a read kiocb, or returns -EAGAIN if the kiocb is marked as nowait and there would be pages to write. Link: https://lkml.kernel.org/r/20230601145904.1385409-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Darrick J. 
Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- block/fops.c | 18 +++--------------- include/linux/pagemap.h | 2 ++ mm/filemap.c | 30 ++++++++++++++++++------------ 3 files changed, 23 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/block/fops.c b/block/fops.c index 58d0aebc7313..575171049c5d 100644 --- a/block/fops.c +++ b/block/fops.c @@ -576,21 +576,9 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) goto reexpand; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, pos, - pos + count - 1)) { - ret = -EAGAIN; - goto reexpand; - } - } else { - ret = filemap_write_and_wait_range(mapping, pos, - pos + count - 1); - if (ret < 0) - goto reexpand; - } - + ret = kiocb_write_and_wait(iocb, count); + if (ret < 0) + goto reexpand; file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c1ae5ebc375f..b6a12ca108b7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -30,6 +30,7 @@ static inline void invalidate_remote_inode(struct inode *inode) int invalidate_inode_pages2(struct address_space *mapping); int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); + int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); int filemap_flush(struct address_space *); @@ -54,6 +55,7 @@ int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); int filemap_fdatawrite_wbc(struct address_space *mapping, struct 
writeback_control *wbc); +int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) { diff --git a/mm/filemap.c b/mm/filemap.c index 3a80a69fa9fa..5566e10ca1a7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2735,6 +2735,21 @@ put_folios: } EXPORT_SYMBOL_GPL(filemap_read); +int kiocb_write_and_wait(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos; + loff_t end = pos + count - 1; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, pos, end)) + return -EAGAIN; + return 0; + } + + return filemap_write_and_wait_range(mapping, pos, end); +} + /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block @@ -2770,18 +2785,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) - return -EAGAIN; - } else { - retval = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (retval < 0) - return retval; - } - + retval = kiocb_write_and_wait(iocb, count); + if (retval < 0) + return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); -- cgit v1.2.3 From e003f74afbd2feadbb9ffbf9135e2d2fb5d320a5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:57 +0200 Subject: filemap: add a kiocb_invalidate_pages helper Factor out a helper that calls filemap_write_and_wait_range and invalidate_inode_pages2_range for the range covered by a write kiocb or returns -EAGAIN if the kiocb is marked as nowait and there would be pages to write or invalidate. 
Link: https://lkml.kernel.org/r/20230601145904.1385409-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Darrick J. Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 + mm/filemap.c | 48 ++++++++++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b6a12ca108b7..7b66a67dba51 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -30,6 +30,7 @@ static inline void invalidate_remote_inode(struct inode *inode) int invalidate_inode_pages2(struct address_space *mapping); int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); diff --git a/mm/filemap.c b/mm/filemap.c index 5566e10ca1a7..6ba6233c4bbb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2750,6 +2750,33 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count) return filemap_write_and_wait_range(mapping, pos, end); } +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos; + loff_t end = pos + count - 1; + int ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* we could block if there are any pages in the range */ + if (filemap_range_has_page(mapping, pos, end)) + return -EAGAIN; + } else { + ret = filemap_write_and_wait_range(mapping, pos, end); + if (ret) + return ret; + } + + /* + * After a write we want buffered reads to be sure to go to 
disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT); +} + /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block @@ -3793,30 +3820,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_SHIFT; - if (iocb->ki_flags & IOCB_NOWAIT) { - /* If there are pages to writeback, return */ - if (filemap_range_has_page(file->f_mapping, pos, - pos + write_len - 1)) - return -EAGAIN; - } else { - written = filemap_write_and_wait_range(mapping, pos, - pos + write_len - 1); - if (written) - goto out; - } - - /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - written = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ + written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; -- cgit v1.2.3 From c402a9a9430b670926decbb284b756ee6f47c1ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:58:58 +0200 Subject: filemap: add a kiocb_invalidate_post_direct_write helper Add a helper to invalidate page cache after a dio write. Link: https://lkml.kernel.org/r/20230601145904.1385409-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Acked-by: Darrick J. 
Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- fs/direct-io.c | 10 ++-------- fs/iomap/direct-io.c | 12 ++---------- include/linux/fs.h | 5 ----- include/linux/pagemap.h | 1 + mm/filemap.c | 37 ++++++++++++++++++++----------------- 5 files changed, 25 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/fs/direct-io.c b/fs/direct-io.c index 0b380bb8a81e..4f9069aee0fe 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -285,14 +285,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) * zeros from unwritten extents. */ if (flags & DIO_COMPLETE_INVALIDATE && - ret > 0 && dio_op == REQ_OP_WRITE && - dio->inode->i_mapping->nrpages) { - err = invalidate_inode_pages2_range(dio->inode->i_mapping, - offset >> PAGE_SHIFT, - (offset + ret - 1) >> PAGE_SHIFT); - if (err) - dio_warn_stale_pagecache(dio->iocb->ki_filp); - } + ret > 0 && dio_op == REQ_OP_WRITE) + kiocb_invalidate_post_direct_write(dio->iocb, ret); inode_dio_end(dio->inode); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 6207a59d2162..0795c54a745b 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -81,7 +81,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; - struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; ssize_t ret = dio->error; @@ -108,15 +107,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. 
*/ - if (!dio->error && dio->size && - (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { - int err; - err = invalidate_inode_pages2_range(inode->i_mapping, - offset >> PAGE_SHIFT, - (offset + dio->size - 1) >> PAGE_SHIFT); - if (err) - dio_warn_stale_pagecache(iocb->ki_filp); - } + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 86b50271b4f7..4f196f827d9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2843,11 +2843,6 @@ static inline void inode_dio_end(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } -/* - * Warn about a page cache invalidation failure diring a direct I/O write. - */ -void dio_warn_stale_pagecache(struct file *filp); - extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7b66a67dba51..716953ee1ebd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -31,6 +31,7 @@ int invalidate_inode_pages2(struct address_space *mapping); int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); +void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count); int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); diff --git a/mm/filemap.c b/mm/filemap.c index 6ba6233c4bbb..b45506f74133 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3789,7 +3789,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. 
*/ -void dio_warn_stale_pagecache(struct file *filp) +static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; @@ -3806,19 +3806,23 @@ void dio_warn_stale_pagecache(struct file *filp) } } +void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + + if (mapping->nrpages && + invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, + (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) + dio_warn_stale_pagecache(iocb->ki_filp); +} + ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = iocb->ki_pos; - ssize_t written; - size_t write_len; - pgoff_t end; - - write_len = iov_iter_count(from); - end = (pos + write_len - 1) >> PAGE_SHIFT; + struct address_space *mapping = iocb->ki_filp->f_mapping; + size_t write_len = iov_iter_count(from); + ssize_t written; /* * If a page can not be invalidated, return 0 to fall back @@ -3828,7 +3832,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (written) { if (written == -EBUSY) return 0; - goto out; + return written; } written = mapping->a_ops->direct_IO(iocb, from); @@ -3850,11 +3854,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * * Skip invalidation for async writes or if mapping has no pages. 
*/ - if (written > 0 && mapping->nrpages && - invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) - dio_warn_stale_pagecache(file); - if (written > 0) { + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos; + + kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { @@ -3865,7 +3869,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); -out: return written; } EXPORT_SYMBOL(generic_file_direct_write); -- cgit v1.2.3 From 44fff0fa08ec5a6d9d5fb05443a36d854d0ece4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 16:59:01 +0200 Subject: fs: factor out a direct_write_fallback helper Add a helper dealing with handling the syncing of a buffered write fallback for direct I/O. Link: https://lkml.kernel.org/r/20230601145904.1385409-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Miklos Szeredi Reviewed-by: Darrick J. 
Wong Cc: Al Viro Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Chao Yu Cc: Christian Brauner Cc: Hannes Reinecke Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Xiubo Li Signed-off-by: Andrew Morton --- fs/libfs.c | 41 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 2 ++ mm/filemap.c | 66 +++++++++++++----------------------------------------- 3 files changed, 58 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/fs/libfs.c b/fs/libfs.c index 89cf614a3271..5b851315eeed 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1613,3 +1613,44 @@ u64 inode_query_iversion(struct inode *inode) return cur >> I_VERSION_QUERIED_SHIFT; } EXPORT_SYMBOL(inode_query_iversion); + +ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, + ssize_t direct_written, ssize_t buffered_written) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos - buffered_written; + loff_t end = iocb->ki_pos - 1; + int err; + + /* + * If the buffered write fallback returned an error, we want to return + * the number of bytes which were written by direct I/O, or the error + * code if that was zero. + * + * Note that this differs from normal direct-io semantics, which will + * return -EFOO even if some bytes were written. + */ + if (unlikely(buffered_written < 0)) { + if (direct_written) + return direct_written; + return buffered_written; + } + + /* + * We need to ensure that the page cache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. 
+ */ + err = filemap_write_and_wait_range(mapping, pos, end); + if (err < 0) { + /* + * We don't know how much we wrote, so just return the number of + * bytes which were direct-written + */ + if (direct_written) + return direct_written; + return err; + } + invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + return direct_written + buffered_written; +} +EXPORT_SYMBOL_GPL(direct_write_fallback); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f196f827d9d..c363f8687c7e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2744,6 +2744,8 @@ extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); +ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, + ssize_t direct_written, ssize_t buffered_written); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); diff --git a/mm/filemap.c b/mm/filemap.c index b45506f74133..916b7c6444fe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3979,23 +3979,19 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t written = 0; - ssize_t err; - ssize_t status; + struct inode *inode = mapping->host; + ssize_t ret; - err = file_remove_privs(file); - if (err) - goto out; + ret = file_remove_privs(file); + if (ret) + return ret; - err = file_update_time(file); - if (err) - goto out; + ret = file_update_time(file); + if (ret) + return ret; if (iocb->ki_flags & IOCB_DIRECT) { - loff_t pos, endbyte; - - written = generic_file_direct_write(iocb, from); + ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to 
* buffered writes. Some filesystems do this for writes to @@ -4003,45 +3999,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ - if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) - goto out; - - pos = iocb->ki_pos; - status = generic_perform_write(iocb, from); - /* - * If generic_perform_write() returned a synchronous error - * then we want to return the number of bytes which were - * direct-written, or the error code if that was zero. Note - * that this differs from normal direct-io semantics, which - * will return -EFOO even if some bytes were written. - */ - if (unlikely(status < 0)) { - err = status; - goto out; - } - /* - * We need to ensure that the page cache pages are written to - * disk and invalidated to preserve the expected O_DIRECT - * semantics. - */ - endbyte = pos + status - 1; - err = filemap_write_and_wait_range(mapping, pos, endbyte); - if (err == 0) { - written += status; - invalidate_mapping_pages(mapping, - pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); - } else { - /* - * We don't know how much we wrote, so just return - * the number of bytes which were direct-written - */ - } - } else { - written = generic_perform_write(iocb, from); + if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) + return ret; + return direct_write_fallback(iocb, from, ret, + generic_perform_write(iocb, from)); } -out: - return written ? written : err; + + return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); -- cgit v1.2.3 From 54cbbbf3faf610fb4eba6f8d39d933bcbfc6f4de Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 4 May 2023 22:27:51 +0100 Subject: mm/mmap: separate writenotify and dirty tracking logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/gup: disallow GUP writing to file-backed mappings by default", v9. 
Writing to file-backed mappings which require folio dirty tracking using GUP is a fundamentally broken operation, as kernel write access to GUP mappings do not adhere to the semantics expected by a file system. A GUP caller uses the direct mapping to access the folio, which does not cause write notify to trigger, nor does it enforce that the caller marks the folio dirty. The problem arises when, after an initial write to the folio, writeback results in the folio being cleaned and then the caller, via the GUP interface, writes to the folio again. As a result of the use of this secondary, direct, mapping to the folio no write notify will occur, and if the caller does mark the folio dirty, this will be done so unexpectedly. For example, consider the following scenario:- 1. A folio is written to via GUP which write-faults the memory, notifying the file system and dirtying the folio. 2. Later, writeback is triggered, resulting in the folio being cleaned and the PTE being marked read-only. 3. The GUP caller writes to the folio, as it is mapped read/write via the direct mapping. 4. The GUP caller, now done with the page, unpins it and sets it dirty (though it does not have to). This change updates both the PUP FOLL_LONGTERM slow and fast APIs. As pin_user_pages_fast_only() does not exist, we can rely on a slightly imperfect whitelisting in the PUP-fast case and fall back to the slow case should this fail. This patch (of 3): vma_wants_writenotify() is specifically intended for setting PTE page table flags, accounting for existing page table flag state and whether the underlying filesystem performs dirty tracking for a file-backed mapping. Everything is predicated firstly on whether the mapping is shared writable, as this is the only instance where dirty tracking is pertinent - MAP_PRIVATE mappings will always be CoW'd and unshared, and read-only file-backed shared mappings cannot be written to, even with FOLL_FORCE. 
All other checks are in line with existing logic, though now separated into checks explicitly for dirty tracking and those for determining how to set page table flags. We make this change so we can perform checks in the GUP logic to determine which mappings might be problematic when written to. Link: https://lkml.kernel.org/r/cover.1683235180.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/0f218370bd49b4e6bbfbb499f7c7b92c26ba1ceb.1683235180.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: John Hubbard Reviewed-by: Mika Penttilä Reviewed-by: Jan Kara Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Cc: Kirill A . Shutemov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + mm/mmap.c | 58 +++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 62bb3272e531..66032f0d515c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2461,6 +2461,7 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) +bool vma_needs_dirty_tracking(struct vm_area_struct *vma); int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { diff --git a/mm/mmap.c b/mm/mmap.c index e1624cb2c04e..f084b7940431 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1454,6 +1454,48 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ +static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) +{ + return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); +} + +static bool vma_is_shared_writable(struct vm_area_struct *vma) +{ + return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == + (VM_WRITE | VM_SHARED); +} + +static bool vma_fs_can_writeback(struct vm_area_struct
*vma) +{ + /* No managed pages to writeback. */ + if (vma->vm_flags & VM_PFNMAP) + return false; + + return vma->vm_file && vma->vm_file->f_mapping && + mapping_can_writeback(vma->vm_file->f_mapping); +} + +/* + * Does this VMA require the underlying folios to have their dirty state + * tracked? + */ +bool vma_needs_dirty_tracking(struct vm_area_struct *vma) +{ + /* Only shared, writable VMAs require dirty tracking. */ + if (!vma_is_shared_writable(vma)) + return false; + + /* Does the filesystem need to be notified? */ + if (vm_ops_needs_writenotify(vma->vm_ops)) + return true; + + /* + * Even if the filesystem doesn't indicate a need for writenotify, if it + * can writeback, dirty tracking is still required. + */ + return vma_fs_can_writeback(vma); +} + /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot @@ -1462,21 +1504,18 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) */ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { - vm_flags_t vm_flags = vma->vm_flags; - const struct vm_operations_struct *vm_ops = vma->vm_ops; - /* If it was private or non-writable, the write bit is already clear */ - if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) + if (!vma_is_shared_writable(vma)) return 0; /* The backer wishes to know when pages are first written to? */ - if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite)) + if (vm_ops_needs_writenotify(vma->vm_ops)) return 1; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != - pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) + pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return 0; /* @@ -1490,13 +1529,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) if (userfaultfd_wp(vma)) return 1; - /* Specialty mapping? 
*/ - if (vm_flags & VM_PFNMAP) - return 0; - /* Can the mapping track the dirty pages? */ - return vma->vm_file && vma->vm_file->f_mapping && - mapping_can_writeback(vma->vm_file->f_mapping); + return vma_fs_can_writeback(vma); } /* -- cgit v1.2.3 From ce5df7764b3b2abaf3687c460a9a1922daaed5b7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 19 May 2023 13:16:52 +0200 Subject: mm: page_isolation: write proper kerneldoc And remove the incorrect header comments. [akpm@linux-foundation.org: s/lower/first/, s/upper/last/, per Mike] Link: https://lkml.kernel.org/r/20230519111652.40658-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Mike Rapoport Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 24 ++++++------------------ mm/page_isolation.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 5456b7be38ae..0ab089e89db4 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -37,24 +37,12 @@ void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); -/* - * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. - */ -int -start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags); - -/* - * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. - * target range is [start_pfn, end_pfn) - */ -void -undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype); - -/* - * Test all pages in [start_pfn, end_pfn) are isolated or not. 
- */ +int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype, int flags, gfp_t gfp_flags); + +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype); + int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int isol_flags); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c6f3605e37ab..6599cc965e21 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -481,10 +481,9 @@ failed: } /** - * start_isolate_page_range() - make page-allocation-type of range of pages to - * be MIGRATE_ISOLATE. - * @start_pfn: The lower PFN of the range to be isolated. - * @end_pfn: The upper PFN of the range to be isolated. + * start_isolate_page_range() - mark page range MIGRATE_ISOLATE + * @start_pfn: The first PFN of the range to be isolated. + * @end_pfn: The last PFN of the range to be isolated. * @migratetype: Migrate type to set in error recovery. * @flags: The following flags are allowed (they can be combined in * a bit mask) @@ -571,8 +570,14 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, return 0; } -/* - * Make isolated pages available again. +/** + * undo_isolate_page_range - undo effects of start_isolate_page_range() + * @start_pfn: The first PFN of the isolated range + * @end_pfn: The last PFN of the isolated range + * @migratetype: New migrate type to set on the range + * + * This finds every MIGRATE_ISOLATE page block in the given range + * and switches it to @migratetype. 
*/ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype) @@ -631,7 +636,21 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, return pfn; } -/* Caller should ensure that requested range is in a single zone */ +/** + * test_pages_isolated - check if pageblocks in range are isolated + * @start_pfn: The first PFN of the isolated range + * @end_pfn: The first PFN *after* the isolated range + * @isol_flags: Testing mode flags + * + * This tests if all in the specified range are free. + * + * If %MEMORY_OFFLINE is specified in @flags, it will consider + * poisoned and offlined pages free as well. + * + * Caller must ensure the requested range doesn't span zones. + * + * Returns 0 if true, -EBUSY if one or more pages are in use. + */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int isol_flags) { -- cgit v1.2.3 From e52ee4cc8fa87a75ab0cfc7bf51c0715a880a08e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 3 Jun 2023 22:25:13 +0800 Subject: mm: remove obsolete alloc_migrate_target() There's only declaration left in the header file. Remove it. 
Link: https://lkml.kernel.org/r/20230603142513.787000-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 0ab089e89db4..4ac34392823a 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -45,7 +45,4 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int isol_flags); - -struct page *alloc_migrate_target(struct page *page, unsigned long private); - #endif -- cgit v1.2.3 From e5797dc011182f8b25420bc977f37cd92fc6e755 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 6 Jun 2023 20:18:13 +0800 Subject: mm: vmscan: mark kswapd_run() and kswapd_stop() __meminit Add __meminit to kswapd_run() and kswapd_stop() to ensure they default to __init when memory hotplug is not enabled.
Link: https://lkml.kernel.org/r/20230606121813.242163-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Yu Zhao Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/vmscan.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2ddbfd85f6c7..b5f6f2916de1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -460,8 +460,8 @@ static inline bool node_reclaim_enabled(void) void check_move_unevictable_folios(struct folio_batch *fbatch); void check_move_unevictable_pages(struct pagevec *pvec); -extern void kswapd_run(int nid); -extern void kswapd_stop(int nid); +extern void __meminit kswapd_run(int nid); +extern void __meminit kswapd_stop(int nid); #ifdef CONFIG_SWAP diff --git a/mm/vmscan.c b/mm/vmscan.c index 9048739c41db..a4e2936f6d35 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7871,7 +7871,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) /* * This kswapd start function will be called by init and node-hot-add. */ -void kswapd_run(int nid) +void __meminit kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -7892,7 +7892,7 @@ void kswapd_run(int nid) * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done(). */ -void kswapd_stop(int nid) +void __meminit kswapd_stop(int nid) { pg_data_t *pgdat = NODE_DATA(nid); struct task_struct *kswapd; -- cgit v1.2.3 From bd5f79ab39367665f40e10c2486aa15e7a841490 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 7 Jun 2023 10:39:52 +0800 Subject: mm/sparse: remove unused parameters in sparse_remove_section() These parameters ms and map_offset are not used in sparse_remove_section(), so remove them. The __remove_section() is only called by __remove_pages(), remove it. And put the WARN_ON_ONCE() in sparse_remove_section(). 
Link: https://lkml.kernel.org/r/20230607023952.2247489-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Reviewed-by: David Hildenbrand Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 5 ++--- mm/memory_hotplug.c | 18 +----------------- mm/sparse.c | 10 +++++++--- 3 files changed, 10 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 04bc286eed42..013c69753c91 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -344,9 +344,8 @@ extern void remove_pfn_range_from_zone(struct zone *zone, extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); -extern void sparse_remove_section(struct mem_section *ms, - unsigned long pfn, unsigned long nr_pages, - unsigned long map_offset, struct vmem_altmap *altmap); +extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern struct zone *zone_for_pfn_range(int online_type, int nid, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9061ac69b1b6..8877734b5f2f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -492,18 +492,6 @@ void __ref remove_pfn_range_from_zone(struct zone *zone, set_zone_contiguous(zone); } -static void __remove_section(unsigned long pfn, unsigned long nr_pages, - unsigned long map_offset, - struct vmem_altmap *altmap) -{ - struct mem_section *ms = __pfn_to_section(pfn); - - if (WARN_ON_ONCE(!valid_section(ms))) - return; - - sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); -} - /** * __remove_pages() - remove sections of pages * @pfn: starting pageframe (must be aligned to start of a section) @@ -520,9 +508,6 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, { const unsigned long end_pfn 
= pfn + nr_pages; unsigned long cur_nr_pages; - unsigned long map_offset = 0; - - map_offset = vmem_altmap_offset(altmap); if (check_pfn_span(pfn, nr_pages)) { WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); @@ -534,8 +519,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, /* Select all remaining pages up to the next section boundary */ cur_nr_pages = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); - __remove_section(pfn, cur_nr_pages, map_offset, altmap); - map_offset = 0; + sparse_remove_section(pfn, cur_nr_pages, altmap); } } diff --git a/mm/sparse.c b/mm/sparse.c index c2afdb26039e..7a29e10193fe 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -922,10 +922,14 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, return 0; } -void sparse_remove_section(struct mem_section *ms, unsigned long pfn, - unsigned long nr_pages, unsigned long map_offset, - struct vmem_altmap *altmap) +void sparse_remove_section(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) { + struct mem_section *ms = __pfn_to_section(pfn); + + if (WARN_ON_ONCE(!valid_section(ms))) + return; + section_deactivate(pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit v1.2.3 From 36ce9d76b0a93bae799e27e4f5ac35478c676592 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 7 Jun 2023 18:15:23 +0200 Subject: shmem: use ramfs_kill_sb() for kill_sb method of ramfs-based tmpfs As the ramfs-based tmpfs uses ramfs_init_fs_context() for the init_fs_context method, which allocates fc->s_fs_info, use ramfs_kill_sb() to free it and avoid a memory leak. 
Link: https://lkml.kernel.org/r/20230607161523.2876433-1-roberto.sassu@huaweicloud.com Fixes: c3b1b1cbf002 ("ramfs: add support for "mode=" mount option") Signed-off-by: Roberto Sassu Cc: Hugh Dickins Cc: David Howells Cc: Al Viro Cc: Signed-off-by: Andrew Morton --- fs/ramfs/inode.c | 2 +- include/linux/ramfs.h | 1 + mm/shmem.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 5ba580c78835..fef477c78107 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -278,7 +278,7 @@ int ramfs_init_fs_context(struct fs_context *fc) return 0; } -static void ramfs_kill_sb(struct super_block *sb) +void ramfs_kill_sb(struct super_block *sb) { kfree(sb->s_fs_info); kill_litter_super(sb); diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 917528d102c4..d506dc63dd47 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -7,6 +7,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev); extern int ramfs_init_fs_context(struct fs_context *fc); +extern void ramfs_kill_sb(struct super_block *sb); #ifdef CONFIG_MMU static inline int diff --git a/mm/shmem.c b/mm/shmem.c index 5e54ab5f61f2..c606ab89693a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4199,7 +4199,7 @@ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .init_fs_context = ramfs_init_fs_context, .parameters = ramfs_fs_parameters, - .kill_sb = kill_litter_super, + .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; -- cgit v1.2.3 From a668968f84265e698a122656c433809ab9f023fa Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Wed, 7 Jun 2023 02:45:48 +0000 Subject: mm/memory_hotplug: remove reset_node_managed_pages() in hotadd_init_pgdat() managed pages has already been set to 0 in free_area_init_core_hotplug(), via zone_init_internals() on each zone. It's pointless to reset again. 
Furthermore, reset_node_managed_pages() no longer needs to be exposed outside of mm/memblock.c. Remove declaration in include/linux/memblock.h and define it as static. In addition to this, the only caller of reset_node_managed_pages() is reset_all_zones_managed_pages(), which is annotated with __init, so it should be safe to also mark reset_node_managed_pages() as __init. Link: https://lkml.kernel.org/r/20230607024548.1240-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Suggested-by: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/memblock.h | 1 - mm/memblock.c | 2 +- mm/memory_hotplug.c | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f82ee3fac1cd..f71ff9f0ec81 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -128,7 +128,6 @@ int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); void memblock_free_all(void); void memblock_free(void *ptr, size_t size); -void reset_node_managed_pages(pg_data_t *pgdat); void reset_all_zones_managed_pages(void); /* Low level functions */ diff --git a/mm/memblock.c b/mm/memblock.c index 3feafea06ab2..da4264528e1e 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2122,7 +2122,7 @@ static unsigned long __init free_low_memory_core_early(void) static int reset_managed_pages_done __initdata; -void reset_node_managed_pages(pg_data_t *pgdat) +static void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8877734b5f2f..5248323fc0f7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1194,7 +1194,6 @@ static pg_data_t __ref *hotadd_init_pgdat(int nid) * online_pages() and offline_pages(). * TODO: should be in free_area_init_core_hotplug?
*/ - reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); return pgdat; -- cgit v1.2.3 From b9c91c43412f2e07a5287dfe7027acdd8fb0b1ef Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 7 Jun 2023 19:51:43 +0000 Subject: mm: zswap: support exclusive loads Commit 71024cb4a0bf ("frontswap: remove frontswap_tmem_exclusive_gets") removed support for exclusive loads from frontswap as it was not used. Bring back exclusive loads support to frontswap by adding an "exclusive" output parameter to frontswap_ops->load. On the zswap side, add a module parameter to enable/disable exclusive loads, and a config option to control the boot default value. Refactor zswap entry invalidation in zswap_frontswap_invalidate_page() into zswap_invalidate_entry() to reuse it in zswap_frontswap_load() if exclusive loads are enabled. With exclusive loads, we avoid having two copies of the same page in memory (compressed & uncompressed) after faulting it in from zswap. On the other hand, if the page is to be reclaimed again without being dirtied, it will be re-compressed. Compression is not usually slow, and a page that was just faulted in is less likely to be reclaimed again soon. 
Link: https://lkml.kernel.org/r/20230607195143.1473802-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Suggested-by: Yu Zhao Acked-by: Johannes Weiner Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Konrad Rzeszutek Wilk Cc: Nhat Pham Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- include/linux/frontswap.h | 2 +- mm/Kconfig | 16 ++++++++++++++++ mm/frontswap.c | 10 ++++++++-- mm/zswap.c | 28 ++++++++++++++++++++-------- 4 files changed, 45 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index a631bac12220..eaa0ac5f9003 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -10,7 +10,7 @@ struct frontswap_ops { void (*init)(unsigned); /* this swap type was just swapon'ed */ int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ - int (*load)(unsigned, pgoff_t, struct page *); /* load a page */ + int (*load)(unsigned, pgoff_t, struct page *, bool *); /* load a page */ void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ }; diff --git a/mm/Kconfig b/mm/Kconfig index 7672a22647b4..12f32f8d26bf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -46,6 +46,22 @@ config ZSWAP_DEFAULT_ON The selection made here can be overridden by using the kernel command line 'zswap.enabled=' option. +config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON + bool "Invalidate zswap entries when pages are loaded" + depends on ZSWAP + help + If selected, exclusive loads for zswap will be enabled at boot, + otherwise it will be disabled. + + If exclusive loads are enabled, when a page is loaded from zswap, + the zswap entry is invalidated at once, as opposed to leaving it + in zswap until the swap entry is freed. + + This avoids having two copies of the same page in memory + (compressed and uncompressed) after faulting in a page from zswap. 
+ The cost is that if the page was never dirtied and needs to be + swapped out again, it will be re-compressed. + choice prompt "Default compressor" depends on ZSWAP diff --git a/mm/frontswap.c b/mm/frontswap.c index 279e55b4ed87..2fb5df3384b8 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -206,6 +206,7 @@ int __frontswap_load(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); + bool exclusive = false; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(!PageLocked(page)); @@ -215,9 +216,14 @@ int __frontswap_load(struct page *page) return -1; /* Try loading from each implementation, until one succeeds. */ - ret = frontswap_ops->load(type, offset, page); - if (ret == 0) + ret = frontswap_ops->load(type, offset, page, &exclusive); + if (ret == 0) { inc_frontswap_loads(); + if (exclusive) { + SetPageDirty(page); + __frontswap_clear(sis, offset); + } + } return ret; } diff --git a/mm/zswap.c b/mm/zswap.c index bcb82e09eb64..9fa86265f6dd 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -138,6 +138,10 @@ static bool zswap_non_same_filled_pages_enabled = true; module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, bool, 0644); +static bool zswap_exclusive_loads_enabled = IS_ENABLED( + CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); +module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); + /********************************* * data structures **********************************/ @@ -1340,12 +1344,22 @@ shrink: goto reject; } +static void zswap_invalidate_entry(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, entry); + + /* drop the initial reference from entry creation */ + zswap_entry_put(tree, entry); +} + /* * returns 0 if the page was successfully decompressed * return -1 on entry not found or error */ static int zswap_frontswap_load(unsigned type, pgoff_t offset, - struct page *page) + 
struct page *page, bool *exclusive) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; @@ -1415,6 +1429,10 @@ stats: freeentry: spin_lock(&tree->lock); zswap_entry_put(tree, entry); + if (!ret && zswap_exclusive_loads_enabled) { + zswap_invalidate_entry(tree, entry); + *exclusive = true; + } spin_unlock(&tree->lock); return ret; @@ -1434,13 +1452,7 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) spin_unlock(&tree->lock); return; } - - /* remove from rbtree */ - zswap_rb_erase(&tree->rbroot, entry); - - /* drop the initial reference from entry creation */ - zswap_entry_put(tree, entry); - + zswap_invalidate_entry(tree, entry); spin_unlock(&tree->lock); } -- cgit v1.2.3 From 26e1a0c3277d7f43856ec424902423be212cc178 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:06:53 -0700 Subject: mm: use pmdp_get_lockless() without surplus barrier() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: allow pte_offset_map[_lock]() to fail", v2. What is it all about? Some mmap_lock avoidance i.e. latency reduction. Initially just for the case of collapsing shmem or file pages to THPs; but likely to be relied upon later in other contexts e.g. freeing of empty page tables (but that's not work I'm doing). mmap_write_lock avoidance when collapsing to anon THPs? Perhaps, but again that's not work I've done: a quick attempt was not as easy as the shmem/file case. I would much prefer not to have to make these small but wide-ranging changes for such a niche case; but failed to find another way, and have heard that shmem MADV_COLLAPSE's usefulness is being limited by that mmap_write_lock it currently requires. These changes (though of course not these exact patches) have been in Google's data centre kernel for three years now: we do rely upon them. What is this preparatory series about? 
The current mmap locking will not be enough to guard against that tricky transition between pmd entry pointing to page table, and empty pmd entry, and pmd entry pointing to huge page: pte_offset_map() will have to validate the pmd entry for itself, returning NULL if no page table is there. What to do about that varies: sometimes nearby error handling indicates just to skip it; but in many cases an ACTION_AGAIN or "goto again" is appropriate (and if that risks an infinite loop, then there must have been an oops, or pfn 0 mistaken for page table, before). Given the likely extension to freeing empty page tables, I have not limited this set of changes to a THP config; and it has been easier, and sets a better example, if each site is given appropriate handling: even where deeper study might prove that failure could only happen if the pmd table were corrupted. Several of the patches are, or include, cleanup on the way; and by the end, pmd_trans_unstable() and suchlike are deleted: pte_offset_map() and pte_offset_map_lock() then handle those original races and more. Most uses of pte_lockptr() are deprecated, with pte_offset_map_nolock() taking its place. This patch (of 32): Use pmdp_get_lockless() in preference to READ_ONCE(*pmdp), to get a more reliable result with PAE (or READ_ONCE as before without PAE); and remove the unnecessary extra barrier()s which got left behind in its callers. HOWEVER: Note the small print in linux/pgtable.h, where it was designed specifically for fast GUP, and depends on interrupts being disabled for its full guarantee: most callers which have been added (here and before) do NOT have interrupts disabled, so there is still some need for caution. 
Link: https://lkml.kernel.org/r/f35279a9-9ac0-de22-d245-591afbfb4dc@google.com Signed-off-by: Hugh Dickins Acked-by: Yu Zhao Acked-by: Peter Xu Cc: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Zack Rusin Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 10 +--------- include/linux/pgtable.h | 17 ----------------- mm/gup.c | 6 +----- mm/hmm.c | 2 +- mm/khugepaged.c | 5 ----- mm/ksm.c | 3 +-- mm/memory.c | 14 ++------------ mm/mprotect.c | 5 ----- mm/page_vma_mapped.c | 2 +- 9 files changed, 7 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0fd96d6e39ce..f7a0817b1ec0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -349,15 +349,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, if (!pud_present(*pud)) goto out; pmd = pmd_offset(pud, address); - /* - * READ_ONCE must function as a barrier with narrower scope - * and it must be equivalent to: - * _pmd = *pmd; barrier(); - * - * This is to deal with the instability (as in - * pmd_trans_unstable) of the pmd. 
- */ - _pmd = READ_ONCE(*pmd); + _pmd = pmdp_get_lockless(pmd); if (pmd_none(_pmd)) goto out; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c5a51481bbb9..8ec27fe69dc8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1344,23 +1344,6 @@ static inline int pud_trans_unstable(pud_t *pud) static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) { pmd_t pmdval = pmdp_get_lockless(pmd); - /* - * The barrier will stabilize the pmdval in a register or on - * the stack so that it will stop changing under the code. - * - * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, - * pmdp_get_lockless is allowed to return a not atomic pmdval - * (for example pointing to an hugepage that has never been - * mapped in the pmd). The below checks will only care about - * the low part of the pmd with 32bit PAE x86 anyway, with the - * exception of pmd_none(). So the important thing is that if - * the low part of the pmd is found null, the high part will - * be also null or the pmd_none() check below would be - * confused. - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - barrier(); -#endif /* * !pmd_present() checks for pmd migration entries * diff --git a/mm/gup.c b/mm/gup.c index a718b956edbe..d448fd286b8c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -654,11 +654,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); - /* - * The READ_ONCE() will stabilize the pmdval in a register or - * on the stack so that it will stop changing under the code. 
- */ - pmdval = READ_ONCE(*pmd); + pmdval = pmdp_get_lockless(pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); if (!pmd_present(pmdval)) diff --git a/mm/hmm.c b/mm/hmm.c index 6a151c09de5e..e23043345615 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -332,7 +332,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, pmd_t pmd; again: - pmd = READ_ONCE(*pmdp); + pmd = pmdp_get_lockless(pmdp); if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, -1, walk); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3649ba12a235..2d206e62d358 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -959,11 +959,6 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, return SCAN_PMD_NULL; pmde = pmdp_get_lockless(*pmd); - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ - barrier(); -#endif if (pmd_none(pmde)) return SCAN_PMD_NONE; if (!pmd_present(pmde)) diff --git a/mm/ksm.c b/mm/ksm.c index 0156bded3a66..df2aa281d49d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1194,8 +1194,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ - pmde = *pmd; - barrier(); + pmde = pmdp_get_lockless(pmd); if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; diff --git a/mm/memory.c b/mm/memory.c index 36082fd42df4..221b21623644 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4923,18 +4923,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) * So now it's safe to run pte_offset_map(). */ vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - vmf->orig_pte = *vmf->pte; + vmf->orig_pte = ptep_get_lockless(vmf->pte); vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID; - /* - * some architectures can have larger ptes than wordsize, - * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and - * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic - * accesses. 
The code below just needs a consistent view - * for the ifs and we later double check anyway with the - * ptl lock held. So here a barrier will do. - */ - barrier(); if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; @@ -5058,9 +5049,8 @@ retry_pud: if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - vmf.orig_pmd = *vmf.pmd; + vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); - barrier(); if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); diff --git a/mm/mprotect.c b/mm/mprotect.c index 92d3d3ca390a..c5a13c0f1017 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -309,11 +309,6 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) { pmd_t pmdval = pmdp_get_lockless(pmd); - /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - barrier(); -#endif - if (pmd_none(pmdval)) return 1; if (pmd_trans_huge(pmdval)) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 4e448cfbc6ef..64aff6718bdb 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -210,7 +210,7 @@ restart: * compiler and used as a stale value after we've observed a * subsequent update. */ - pmde = READ_ONCE(*pvmw->pmd); + pmde = pmdp_get_lockless(pvmw->pmd); if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) || (pmd_present(pmde) && pmd_devmap(pmde))) { -- cgit v1.2.3 From 0cb8fd4d14165a7e654048e43983d86f75b90879 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:08:20 -0700 Subject: mm/migrate: remove cruft from migration_entry_wait()s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migration_entry_wait_on_locked() does not need to take a mapped pte pointer, its callers can do the unmap first. Annotate it with __releases(ptl) to reduce sparse warnings. Fold __migration_entry_wait_huge() into migration_entry_wait_huge(). 
Fold __migration_entry_wait() into migration_entry_wait(), preferring the tighter pte_offset_map_lock() to pte_offset_map() and pte_lockptr(). Link: https://lkml.kernel.org/r/b0e2a532-cdf2-561b-e999-f3b13b8d6d3@google.com Signed-off-by: Hugh Dickins Reviewed-by: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- include/linux/migrate.h | 4 ++-- include/linux/swapops.h | 17 +++-------------- mm/filemap.c | 13 ++++--------- mm/migrate.c | 37 +++++++++++++------------------------ 4 files changed, 22 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6de5756d8533..711dd9412561 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -75,8 +75,8 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, - spinlock_t *ptl); +void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) + __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3a451b7afcb3..4c932cb45e0b 100644 --- a/include/linux/swapops.h +++ 
b/include/linux/swapops.h @@ -332,15 +332,9 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) return false; } -extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, - spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -#ifdef CONFIG_HUGETLB_PAGE -extern void __migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); -#endif /* CONFIG_HUGETLB_PAGE */ #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -362,15 +356,10 @@ static inline int is_migration_entry(swp_entry_t swp) return 0; } -static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, - spinlock_t *ptl) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) { } -#ifdef CONFIG_HUGETLB_PAGE -static inline void __migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *ptep, spinlock_t *ptl) { } -static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } -#endif /* CONFIG_HUGETLB_PAGE */ + unsigned long address) { } +static inline void migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *pte) { } static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; diff --git a/mm/filemap.c b/mm/filemap.c index 916b7c6444fe..e0259fb823a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1362,8 +1362,6 @@ repeat: /** * migration_entry_wait_on_locked - Wait for a migration entry to be removed * @entry: migration swap entry. - * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required - * for pte entries, pass NULL for pmd entries. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. 
This is @@ -1372,13 +1370,13 @@ repeat: * should be called while holding the ptl for the migration entry referencing * the page. * - * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock(). + * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ -void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, - spinlock_t *ptl) +void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) + __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; @@ -1412,10 +1410,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. */ - if (ptep) - pte_unmap_unlock(ptep, ptl); - else - spin_unlock(ptl); + spin_unlock(ptl); for (;;) { unsigned int flags; diff --git a/mm/migrate.c b/mm/migrate.c index 30b5ce10935e..c1f2c40441e1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -296,14 +296,18 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. 
*/ -void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, - spinlock_t *ptl) +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { + spinlock_t *ptl; + pte_t *ptep; pte_t pte; swp_entry_t entry; - spin_lock(ptl); + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; + pte_unmap(ptep); + if (!is_swap_pte(pte)) goto out; @@ -311,18 +315,10 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!is_migration_entry(entry)) goto out; - migration_entry_wait_on_locked(entry, ptep, ptl); + migration_entry_wait_on_locked(entry, ptl); return; out: - pte_unmap_unlock(ptep, ptl); -} - -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) -{ - spinlock_t *ptl = pte_lockptr(mm, pmd); - pte_t *ptep = pte_offset_map(pmd, address); - __migration_entry_wait(mm, ptep, ptl); + spin_unlock(ptl); } #ifdef CONFIG_HUGETLB_PAGE @@ -332,9 +328,9 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, * * This function will release the vma lock before returning. */ -void __migration_entry_wait_huge(struct vm_area_struct *vma, - pte_t *ptep, spinlock_t *ptl) +void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *ptep) { + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep); pte_t pte; hugetlb_vma_assert_locked(vma); @@ -352,16 +348,9 @@ void __migration_entry_wait_huge(struct vm_area_struct *vma, * lock release in migration_entry_wait_on_locked(). 
*/ hugetlb_vma_unlock_read(vma); - migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); + migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl); } } - -void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) -{ - spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); - - __migration_entry_wait_huge(vma, pte, ptl); -} #endif #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION @@ -372,7 +361,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl); + migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl); return; unlock: spin_unlock(ptl); -- cgit v1.2.3 From 46c475bd676bb05077c8a38b37f175552f035406 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:09:25 -0700 Subject: mm/pgtable: kmap_local_page() instead of kmap_atomic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pte_offset_map() was still using kmap_atomic(): update it to the preferred kmap_local_page() before making further changes there, in case we need this as a bisection point; but I doubt it can cause any trouble. Link: https://lkml.kernel.org/r/d74dc4b3-6a76-446f-8f5-52ae271fa07d@google.com Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 8ec27fe69dc8..94235ff2706e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -96,9 +96,9 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \ + ((pte_t *)kmap_local_page(pmd_page(*(dir))) + \ pte_index((address))) -#define pte_unmap(pte) kunmap_atomic((pte)) +#define pte_unmap(pte) kunmap_local((pte)) #else #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte)) /* NOP */ -- cgit v1.2.3 From 0d940a9b270b9220dcff74d8e9123c9788365751 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:10:32 -0700 Subject: mm/pgtable: allow pte_offset_map[_lock]() to fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make pte_offset_map() a wrapper for __pte_offset_map() (optionally outputs pmdval), pte_offset_map_lock() a sparse __cond_lock wrapper for __pte_offset_map_lock(): those __funcs added in mm/pgtable-generic.c. __pte_offset_map() do pmdval validation (including pmd_clear_bad() when pmd_bad()), returning NULL if pmdval is not for a page table. __pte_offset_map_lock() verify pmdval unchanged after getting the lock, trying again if it changed. 
No #ifdef CONFIG_TRANSPARENT_HUGEPAGE around them: that could be done to cover the imminent case, but we expect to generalize it later, and it makes a mess of where to do the pmd_bad() clearing. Add pte_offset_map_nolock(): outputs ptl like pte_offset_map_lock(), without actually taking the lock. This will be preferred to open uses of pte_lockptr(), because (when split ptlock is in page table's struct page) it points to the right lock for the returned pte pointer, even if *pmd gets changed racily afterwards. Update corresponding Documentation. Do not add the anticipated rcu_read_lock() and rcu_read_unlock()s yet: they have to wait until all architectures are balancing pte_offset_map()s with pte_unmap()s (as in the arch series posted earlier). But comment where they will go, so that it's easy to add them for experiments. And only when those are in place can transient racy failure cases be enabled. Add more safety for the PAE mismatched pmd_low pmd_high case at that time. Link: https://lkml.kernel.org/r/2929bfd-9893-a374-e463-4c3127ff9b9d@google.com Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- Documentation/mm/split_page_table_lock.rst | 17 +++++---- include/linux/mm.h | 27 +++++++++----- include/linux/pgtable.h | 22 ++++++++---- mm/pgtable-generic.c | 56 ++++++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst index 50ee0dfc95be..a834fad9de12 100644 --- a/Documentation/mm/split_page_table_lock.rst +++ b/Documentation/mm/split_page_table_lock.rst @@ -14,15 +14,20 @@ tables. Access to higher level tables protected by mm->page_table_lock. 
There are helpers to lock/unlock a table and other accessor functions: - pte_offset_map_lock() - maps pte and takes PTE table lock, returns pointer to the taken - lock; + maps PTE and takes PTE table lock, returns pointer to PTE with + pointer to its PTE table lock, or returns NULL if no PTE table; + - pte_offset_map_nolock() + maps PTE, returns pointer to PTE with pointer to its PTE table + lock (not taken), or returns NULL if no PTE table; + - pte_offset_map() + maps PTE, returns pointer to PTE, or returns NULL if no PTE table; + - pte_unmap() + unmaps PTE table; - pte_unmap_unlock() unlocks and unmaps PTE table; - pte_alloc_map_lock() - allocates PTE table if needed and take the lock, returns pointer - to taken lock or NULL if allocation failed; - - pte_lockptr() - returns pointer to PTE table lock; + allocates PTE table if needed and takes its lock, returns pointer to + PTE with pointer to its lock, or returns NULL if allocation failed; - pmd_lock() takes PMD table lock, returns pointer to taken lock; - pmd_lockptr() diff --git a/include/linux/mm.h b/include/linux/mm.h index 66032f0d515c..a08dc8cc48fb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2827,14 +2827,25 @@ static inline void pgtable_pte_page_dtor(struct page *page) dec_lruvec_page_state(page, NR_PAGETABLE); } -#define pte_offset_map_lock(mm, pmd, address, ptlp) \ -({ \ - spinlock_t *__ptl = pte_lockptr(mm, pmd); \ - pte_t *__pte = pte_offset_map(pmd, address); \ - *(ptlp) = __ptl; \ - spin_lock(__ptl); \ - __pte; \ -}) +pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) +{ + return __pte_offset_map(pmd, addr, NULL); +} + +pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); +static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) +{ + pte_t *pte; + + __cond_lock(*ptlp, pte = 
__pte_offset_map_lock(mm, pmd, addr, ptlp)); + return pte; +} + +pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94235ff2706e..3fabbb018557 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -94,14 +94,22 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) #define pte_offset_kernel pte_offset_kernel #endif -#if defined(CONFIG_HIGHPTE) -#define pte_offset_map(dir, address) \ - ((pte_t *)kmap_local_page(pmd_page(*(dir))) + \ - pte_index((address))) -#define pte_unmap(pte) kunmap_local((pte)) +#ifdef CONFIG_HIGHPTE +#define __pte_map(pmd, address) \ + ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) +#define pte_unmap(pte) do { \ + kunmap_local((pte)); \ + /* rcu_read_unlock() to be added later */ \ +} while (0) #else -#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) -#define pte_unmap(pte) ((void)(pte)) /* NOP */ +static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) +{ + return pte_offset_kernel(pmd, address); +} +static inline void pte_unmap(pte_t *pte) +{ + /* rcu_read_unlock() to be added later */ +} #endif /* Find an entry in the second-level page table.. 
*/ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index d2fc52bffafc..c7ab18a5fb77 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include @@ -229,3 +231,57 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) +{ + pmd_t pmdval; + + /* rcu_read_lock() to be added later */ + pmdval = pmdp_get_lockless(pmd); + if (pmdvalp) + *pmdvalp = pmdval; + if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) + goto nomap; + if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))) + goto nomap; + if (unlikely(pmd_bad(pmdval))) { + pmd_clear_bad(pmd); + goto nomap; + } + return __pte_map(&pmdval, addr); +nomap: + /* rcu_read_unlock() to be added later */ + return NULL; +} + +pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) +{ + pmd_t pmdval; + pte_t *pte; + + pte = __pte_offset_map(pmd, addr, &pmdval); + if (likely(pte)) + *ptlp = pte_lockptr(mm, &pmdval); + return pte; +} + +pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) +{ + spinlock_t *ptl; + pmd_t pmdval; + pte_t *pte; +again: + pte = __pte_offset_map(pmd, addr, &pmdval); + if (unlikely(!pte)) + return pte; + ptl = pte_lockptr(mm, &pmdval); + spin_lock(ptl); + if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) { + *ptlp = ptl; + return pte; + } + pte_unmap_unlock(pte, ptl); + goto again; +} -- cgit v1.2.3 From feda5c393a6c843c7bf1fc49e1381e2d3822b564 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:50:37 -0700 Subject: mm/pgtable: delete pmd_trans_unstable() and friends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Delete pmd_trans_unstable, pmd_none_or_trans_huge_or_clear_bad() and 
pmd_devmap_trans_unstable(), all now unused. With mixed feelings, delete all the comments on pmd_trans_unstable(). That was very good documentation of a subtle state, and this series does not even eliminate that state: but rather, normalizes and extends it, asking pte_offset_map[_lock]() callers to anticipate failure, without regard for whether mmap_read_lock() or mmap_write_lock() is held. Retain pud_trans_unstable(), which has one use in __handle_mm_fault(), but delete its equivalent pud_none_or_trans_huge_or_dev_or_clear_bad(). While there, move the default arch_needs_pgtable_deposit() definition up near where pgtable_trans_huge_deposit() and withdraw() are declared. Link: https://lkml.kernel.org/r/5abdab3-3136-b42e-274d-9c6281bfb79@google.com Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 103 ++++-------------------------------------------- mm/khugepaged.c | 4 -- 2 files changed, 7 insertions(+), 100 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 3fabbb018557..a1326e61d7ee 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -599,6 +599,10 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif +#ifndef arch_needs_pgtable_deposit +#define 
arch_needs_pgtable_deposit() (false) +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an @@ -1300,9 +1304,10 @@ static inline int pud_trans_huge(pud_t pud) } #endif -/* See pmd_none_or_trans_huge_or_clear_bad for discussion. */ -static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud) +static inline int pud_trans_unstable(pud_t *pud) { +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = READ_ONCE(*pud); if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval)) @@ -1311,104 +1316,10 @@ static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud) pud_clear_bad(pud); return 1; } - return 0; -} - -/* See pmd_trans_unstable for discussion. */ -static inline int pud_trans_unstable(pud_t *pud) -{ -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ - defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - return pud_none_or_trans_huge_or_dev_or_clear_bad(pud); -#else - return 0; #endif -} - -#ifndef arch_needs_pgtable_deposit -#define arch_needs_pgtable_deposit() (false) -#endif -/* - * This function is meant to be used by sites walking pagetables with - * the mmap_lock held in read mode to protect against MADV_DONTNEED and - * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd - * into a null pmd and the transhuge page fault can convert a null pmd - * into an hugepmd or into a regular pmd (if the hugepage allocation - * fails). While holding the mmap_lock in read mode the pmd becomes - * stable and stops changing under us only if it's not null and not a - * transhuge pmd. When those races occurs and this function makes a - * difference vs the standard pmd_none_or_clear_bad, the result is - * undefined so behaving like if the pmd was none is safe (because it - * can return none anyway). 
The compiler level barrier() is critically - * important to compute the two checks atomically on the same pmdval. - * - * For 32bit kernels with a 64bit large pmd_t this automatically takes - * care of reading the pmd atomically to avoid SMP race conditions - * against pmd_populate() when the mmap_lock is hold for reading by the - * caller (a special atomic read not done by "gcc" as in the generic - * version above, is also needed when THP is disabled because the page - * fault can populate the pmd from under us). - */ -static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) -{ - pmd_t pmdval = pmdp_get_lockless(pmd); - /* - * !pmd_present() checks for pmd migration entries - * - * The complete check uses is_pmd_migration_entry() in linux/swapops.h - * But using that requires moving current function and pmd_trans_unstable() - * to linux/swapops.h to resolve dependency, which is too much code move. - * - * !pmd_present() is equivalent to is_pmd_migration_entry() currently, - * because !pmd_present() pages can only be under migration not swapped - * out. - * - * pmd_none() is preserved for future condition checks on pmd migration - * entries and not confusing with this function name, although it is - * redundant with !pmd_present(). - */ - if (pmd_none(pmdval) || pmd_trans_huge(pmdval) || - (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval))) - return 1; - if (unlikely(pmd_bad(pmdval))) { - pmd_clear_bad(pmd); - return 1; - } return 0; } -/* - * This is a noop if Transparent Hugepage Support is not built into - * the kernel. Otherwise it is equivalent to - * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in - * places that already verified the pmd is not none and they want to - * walk ptes while holding the mmap sem in read mode (write mode don't - * need this). 
If THP is not enabled, the pmd can't go away under the - * code even if MADV_DONTNEED runs, but if THP is enabled we need to - * run a pmd_trans_unstable before walking the ptes after - * split_huge_pmd returns (because it may have run when the pmd become - * null, but then a page fault can map in a THP and not a regular page). - */ -static inline int pmd_trans_unstable(pmd_t *pmd) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - return pmd_none_or_trans_huge_or_clear_bad(pmd); -#else - return 0; -#endif -} - -/* - * the ordering of these checks is important for pmds with _page_devmap set. - * if we check pmd_trans_unstable() first we will trip the bad_pmd() check - * inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly - * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. - */ -static inline int pmd_devmap_trans_unstable(pmd_t *pmd) -{ - return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); -} - #ifndef CONFIG_NUMA_BALANCING /* * Technically a PTE can be PROTNONE even when not doing NUMA balancing but diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d1951ed572f8..881669e738c0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -944,10 +944,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } -/* - * See pmd_trans_unstable() for how the result may change out from - * underneath us, even if we hold mmap_lock in read. 
- */ static int find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) -- cgit v1.2.3 From 4f8fcf4ced0b7184149045818dcc2f9e2689b775 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:52:17 -0700 Subject: mm/swap: swap_vma_readahead() do the pte_offset_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit swap_vma_readahead() has been proceeding in an unconventional way, its preliminary swap_ra_info() doing the pte_offset_map() and pte_unmap(), then relying on that pte pointer even after the pte_unmap() - in its CONFIG_64BIT case (I think !CONFIG_HIGHPTE was intended; whereas 32-bit copied ptes to stack while they were mapped, but had to limit how many). Though it would be difficult to construct a failing testcase, accessing page table after pte_unmap() will become bad practice, even on 64-bit: an rcu_read_unlock() in pte_unmap() will allow page table to be freed. Move relevant definitions from include/linux/swap.h to mm/swap_state.c, nothing else used them. Delete the CONFIG_64BIT distinction and buffer, delete all reference to ptes from swap_ra_info(), use pte_offset_map() repeatedly in swap_vma_readahead(), breaking from the loop if it fails. (Will the repeated "map" and "unmap" show up as a slowdown anywhere? If so, maybe modify __read_swap_cache_async() to do the pte_unmap() only when it does not find the page already in the swapcache.) Use ptep_get_lockless(), mainly for its READ_ONCE(). Correctly advance the address passed down to each call of __read__swap_cache_async(). Link: https://lkml.kernel.org/r/b7c64ab3-9e44-aac0-d2b-c57de578af1c@google.com Signed-off-by: Hugh Dickins Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- include/linux/swap.h | 19 ------------------- mm/swap_state.c | 45 ++++++++++++++++++++++++--------------------- 2 files changed, 24 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index b5f6f2916de1..ce7e82cf787f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -337,25 +337,6 @@ struct swap_info_struct { */ }; -#ifdef CONFIG_64BIT -#define SWAP_RA_ORDER_CEILING 5 -#else -/* Avoid stack overflow, because we need to save part of page table */ -#define SWAP_RA_ORDER_CEILING 3 -#define SWAP_RA_PTE_CACHE_SIZE (1 << SWAP_RA_ORDER_CEILING) -#endif - -struct vma_swap_readahead { - unsigned short win; - unsigned short offset; - unsigned short nr_pte; -#ifdef CONFIG_64BIT - pte_t *ptes; -#else - pte_t ptes[SWAP_RA_PTE_CACHE_SIZE]; -#endif -}; - static inline swp_entry_t folio_swap_entry(struct folio *folio) { swp_entry_t entry = { .val = page_private(&folio->page) }; diff --git a/mm/swap_state.c b/mm/swap_state.c index ef32353c18a6..a33c60e0158f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -709,6 +709,14 @@ void exit_swap_address_space(unsigned int type) swapper_spaces[type] = NULL; } +#define SWAP_RA_ORDER_CEILING 5 + +struct vma_swap_readahead { + unsigned short win; + unsigned short offset; + unsigned short nr_pte; +}; + static void swap_ra_info(struct vm_fault *vmf, struct vma_swap_readahead *ra_info) { @@ -716,11 +724,7 @@ static void swap_ra_info(struct vm_fault *vmf, unsigned long ra_val; unsigned long faddr, pfn, fpfn, lpfn, rpfn; unsigned long start, 
end; - pte_t *pte, *orig_pte; unsigned int max_win, hits, prev_win, win; -#ifndef CONFIG_64BIT - pte_t *tpte; -#endif max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); @@ -739,12 +743,9 @@ static void swap_ra_info(struct vm_fault *vmf, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); - if (win == 1) return; - /* Copy the PTEs because the page table may be unmapped */ - orig_pte = pte = pte_offset_map(vmf->pmd, faddr); if (fpfn == pfn + 1) { lpfn = fpfn; rpfn = fpfn + win; @@ -764,15 +765,6 @@ static void swap_ra_info(struct vm_fault *vmf, ra_info->nr_pte = end - start; ra_info->offset = fpfn - start; - pte -= ra_info->offset; -#ifdef CONFIG_64BIT - ra_info->ptes = pte; -#else - tpte = ra_info->ptes; - for (pfn = start; pfn != end; pfn++) - *tpte++ = *pte++; -#endif - pte_unmap(orig_pte); } /** @@ -796,7 +788,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct swap_iocb *splug = NULL; struct vm_area_struct *vma = vmf->vma; struct page *page; - pte_t *pte, pentry; + pte_t *pte = NULL, pentry; + unsigned long addr; swp_entry_t entry; unsigned int i; bool page_allocated; @@ -808,17 +801,25 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, if (ra_info.win == 1) goto skip; + addr = vmf->address - (ra_info.offset * PAGE_SIZE); + blk_start_plug(&plug); - for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte; - i++, pte++) { - pentry = *pte; + for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) { + if (!pte++) { + pte = pte_offset_map(vmf->pmd, addr); + if (!pte) + break; + } + pentry = ptep_get_lockless(pte); if (!is_swap_pte(pentry)) continue; entry = pte_to_swp_entry(pentry); if (unlikely(non_swap_entry(entry))) continue; + pte_unmap(pte); + pte = NULL; page = __read_swap_cache_async(entry, gfp_mask, vma, - vmf->address, &page_allocated); + addr, &page_allocated); if (!page) continue; if (page_allocated) { @@ -830,6 +831,8 @@ static 
struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, } put_page(page); } + if (pte) + pte_unmap(pte); blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); -- cgit v1.2.3 From b95826c9aa48b2997b3973b42a8716ba132b920e Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 12 Jun 2023 09:34:05 -0700 Subject: mm: remove set_compound_page_dtor() All users can use the folio equivalent so this function can be safely removed. Link: https://lkml.kernel.org/r/20230612163405.99345-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Tarun Sahu Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ---------- mm/huge_memory.c | 2 +- mm/internal.h | 2 +- 3 files changed, 2 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a08dc8cc48fb..8f40bf17d597 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1223,16 +1223,6 @@ enum compound_dtor_id { }; extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; -static inline void set_compound_page_dtor(struct page *page, - enum compound_dtor_id compound_dtor) -{ - struct folio *folio = (struct folio *)page; - - VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page); - VM_BUG_ON_PAGE(!PageHead(page), page); - folio->_folio_dtor = compound_dtor; -} - static inline void folio_set_compound_dtor(struct folio *folio, enum compound_dtor_id compound_dtor) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 31bc8fa768e3..76f970aa5b4d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -583,7 +583,7 @@ void prep_transhuge_page(struct page *page) VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); INIT_LIST_HEAD(&folio->_deferred_list); - set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); + folio_set_compound_dtor(folio, TRANSHUGE_PAGE_DTOR); } static inline bool is_transparent_hugepage(struct page *page) diff --git a/mm/internal.h b/mm/internal.h index 
faf0508d89a5..33b8b8f66af3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -391,7 +391,7 @@ static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); -- cgit v1.2.3 From 4ab5f8ec7d71aea5fe13a48248242130f84ac6bb Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:45 +0100 Subject: mm/slab: decouple ARCH_KMALLOC_MINALIGN from ARCH_DMA_MINALIGN Patch series "mm, dma, arm64: Reduce ARCH_KMALLOC_MINALIGN to 8", v7. A series reducing the kmalloc() minimum alignment on arm64 to 8 (from 128). This patch (of 17): In preparation for supporting a kmalloc() minimum alignment smaller than the arch DMA alignment, decouple the two definitions. This requires that either the kmalloc() caches are aligned to a (run-time) cache-line size or the DMA API bounces unaligned kmalloc() allocations. Subsequent patches will implement both options. After this patch, ARCH_DMA_MINALIGN is expected to be used in static alignment annotations and defined by an architecture to be the maximum alignment for all supported configurations/SoCs in a single Image. Architectures opting in to a smaller ARCH_KMALLOC_MINALIGN will need to define its value in the arch headers. Since ARCH_DMA_MINALIGN is now always defined, adjust the #ifdef in dma_get_cache_alignment() so that there is no change for architectures not requiring a minimum DMA alignment. Link: https://lkml.kernel.org/r/20230612153201.554742-1-catalin.marinas@arm.com Link: https://lkml.kernel.org/r/20230612153201.554742-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Tested-by: Isaac J. 
Manjarres Cc: Vlastimil Babka Cc: Christoph Hellwig Cc: Robin Murphy Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: Rafael J. Wysocki Cc: Saravana Kannan Cc: Will Deacon Cc: Jerry Snitselaar Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Signed-off-by: Andrew Morton --- include/linux/cache.h | 6 ++++++ include/linux/dma-mapping.h | 3 ++- include/linux/slab.h | 14 ++++++++++---- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/cache.h b/include/linux/cache.h index 5da1bbd96154..9900d20b76c2 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -98,4 +98,10 @@ struct cacheline_padding { #define CACHELINE_PADDING(name) #endif +#ifdef ARCH_DMA_MINALIGN +#define ARCH_HAS_DMA_MINALIGN +#else +#define ARCH_DMA_MINALIGN __alignof__(unsigned long long) +#endif + #endif /* __LINUX_CACHE_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0ee20b764000..a50375331eac 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -2,6 +2,7 @@ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H +#include <linux/cache.h> #include <linux/sizes.h> #include <linux/string.h> #include <linux/device.h> @@ -545,7 +546,7 @@ static inline int dma_set_min_align_mask(struct device *dev, static inline int dma_get_cache_alignment(void) { -#ifdef ARCH_DMA_MINALIGN +#ifdef ARCH_HAS_DMA_MINALIGN return ARCH_DMA_MINALIGN; #endif return 1; diff --git a/include/linux/slab.h b/include/linux/slab.h index 6b3e155b70bf..ca53425e9b32 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -12,6 +12,7 @@ #ifndef _LINUX_SLAB_H #define _LINUX_SLAB_H +#include <linux/cache.h> #include <linux/gfp.h> #include <linux/overflow.h> #include <linux/types.h> @@ -235,12 +236,17 @@ void kmem_dump_obj(void *object); * alignment larger than the alignment of a 64-bit integer. * Setting ARCH_DMA_MINALIGN in arch headers allows that. 
*/ -#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8 +#ifdef ARCH_HAS_DMA_MINALIGN +#if ARCH_DMA_MINALIGN > 8 && !defined(ARCH_KMALLOC_MINALIGN) #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN -#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN -#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) -#else +#endif +#endif + +#ifndef ARCH_KMALLOC_MINALIGN #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +#elif ARCH_KMALLOC_MINALIGN > 8 +#define KMALLOC_MIN_SIZE ARCH_KMALLOC_MINALIGN +#define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) #endif /* -- cgit v1.2.3 From 8c57da28dc3df4e091474a004b5596c9b88a3be0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:46 +0100 Subject: dma: allow dma_get_cache_alignment() to be overridden by the arch code On arm64, ARCH_DMA_MINALIGN is larger than most cache line size configurations deployed. Allow an architecture to override dma_get_cache_alignment() in order to return a run-time probed value (e.g. cache_line_size()). Link: https://lkml.kernel.org/r/20230612153201.554742-3-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reviewed-by: Christoph Hellwig Tested-by: Isaac J. Manjarres Cc: Robin Murphy Cc: Will Deacon Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. 
Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/dma-mapping.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index a50375331eac..e13050eb9777 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -544,6 +544,7 @@ static inline int dma_set_min_align_mask(struct device *dev, return 0; } +#ifndef dma_get_cache_alignment static inline int dma_get_cache_alignment(void) { #ifdef ARCH_HAS_DMA_MINALIGN @@ -551,6 +552,7 @@ static inline int dma_get_cache_alignment(void) #endif return 1; } +#endif static inline void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) -- cgit v1.2.3 From 88b216d339691888ef98644a5eae62c3d9c8ddf0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:54 +0100 Subject: iio: core: use ARCH_DMA_MINALIGN instead of ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN represents the minimum (static) alignment for safe DMA operations while ARCH_KMALLOC_MINALIGN is the minimum kmalloc() objects alignment. Link: https://lkml.kernel.org/r/20230612153201.554742-11-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Acked-by: Jonathan Cameron Tested-by: Isaac J. Manjarres Cc: Lars-Peter Clausen Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. 
Wysocki" Cc: Robin Murphy Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/iio/iio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 81413cd3a3e7..d28a5e8097e4 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -722,7 +722,7 @@ static inline void *iio_device_get_drvdata(const struct iio_dev *indio_dev) * must not share cachelines with the rest of the structure, thus making * them safe for use with non-coherent DMA. */ -#define IIO_DMA_MINALIGN ARCH_KMALLOC_MINALIGN +#define IIO_DMA_MINALIGN ARCH_DMA_MINALIGN struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv); /* The information at the returned address is guaranteed to be cacheline aligned */ -- cgit v1.2.3 From af2880ec44021d32cc72a5aa7c5d7d7beaa722d3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jun 2023 16:31:56 +0100 Subject: scatterlist: add dedicated config for DMA flags The DMA flags field will be useful for users beyond PCI P2P, so upgrade to its own dedicated config option. [catalin.marinas@arm.com: use #ifdef CONFIG_NEED_SG_DMA_FLAGS in scatterlist.h] [catalin.marinas@arm.com: update PCI_P2PDMA dma_flags comment in scatterlist.h] Link: https://lkml.kernel.org/r/20230612153201.554742-13-catalin.marinas@arm.com Signed-off-by: Robin Murphy Signed-off-by: Catalin Marinas Reviewed-by: Christoph Hellwig Tested-by: Isaac J. Manjarres Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. 
Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/pci/Kconfig | 1 + include/linux/scatterlist.h | 13 ++++++------- kernel/dma/Kconfig | 3 +++ 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 9309f2469b41..3c07d8d214b3 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -168,6 +168,7 @@ config PCI_P2PDMA # depends on 64BIT select GENERIC_ALLOCATOR + select NEED_SG_DMA_FLAGS help Enableѕ drivers to do PCI peer-to-peer transactions to and from BARs that are exposed in other devices that are the part of diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 375a5e90d86a..19833fd4113b 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -16,7 +16,7 @@ struct scatterlist { #ifdef CONFIG_NEED_SG_DMA_LENGTH unsigned int dma_length; #endif -#ifdef CONFIG_PCI_P2PDMA +#ifdef CONFIG_NEED_SG_DMA_FLAGS unsigned int dma_flags; #endif }; @@ -249,12 +249,11 @@ static inline void sg_unmark_end(struct scatterlist *sg) } /* - * CONFGI_PCI_P2PDMA depends on CONFIG_64BIT which means there is 4 bytes - * in struct scatterlist (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). - * Use this padding for DMA flags bits to indicate when a specific - * dma address is a bus address. + * One 64-bit architectures there is a 4-byte padding in struct scatterlist + * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA + * flags bits to indicate when a specific dma address is a bus address. 
*/ -#ifdef CONFIG_PCI_P2PDMA +#ifdef CONFIG_NEED_SG_DMA_FLAGS #define SG_DMA_BUS_ADDRESS (1 << 0) @@ -312,7 +311,7 @@ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { } -#endif +#endif /* CONFIG_NEED_SG_DMA_FLAGS */ /** * sg_phys - Return physical address of an sg entry diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 6677d0e64d27..acc6f231259c 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -24,6 +24,9 @@ config DMA_OPS_BYPASS config ARCH_HAS_DMA_MAP_DIRECT bool +config NEED_SG_DMA_FLAGS + bool + config NEED_SG_DMA_LENGTH bool -- cgit v1.2.3 From cb147bbe22d2be9b49021c2e5dacdf2935745d1c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jun 2023 16:31:57 +0100 Subject: dma-mapping: name SG DMA flag helpers consistently sg_is_dma_bus_address() is inconsistent with the naming pattern of its corresponding setters and its own kerneldoc, so take the majority vote and rename it sg_dma_is_bus_address() (and fix up the missing underscores in the kerneldoc too). This gives us a nice clear pattern where SG DMA flags are SG_DMA_<NAME>, and the helpers for acting on them are sg_dma_<action>_<name>(). Link: https://lkml.kernel.org/r/20230612153201.554742-14-catalin.marinas@arm.com Signed-off-by: Robin Murphy Signed-off-by: Catalin Marinas Reviewed-by: Christoph Hellwig Reviewed-by: Jerry Snitselaar Reviewed-by: Logan Gunthorpe Link: https://lore.kernel.org/r/fa2eca2862c7ffc41b50337abffb2dfd2864d3ea.1685036694.git.robin.murphy@arm.com Tested-by: Isaac J. Manjarres Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. 
Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/iommu/dma-iommu.c | 8 ++++---- drivers/iommu/iommu.c | 2 +- include/linux/scatterlist.h | 8 ++++---- kernel/dma/direct.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 7a9f0b0bddbd..b8bba4aa196f 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1080,7 +1080,7 @@ static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents, sg_dma_address(s) = DMA_MAPPING_ERROR; sg_dma_len(s) = 0; - if (sg_is_dma_bus_address(s)) { + if (sg_dma_is_bus_address(s)) { if (i > 0) cur = sg_next(cur); @@ -1136,7 +1136,7 @@ static void __invalidate_sg(struct scatterlist *sg, int nents) int i; for_each_sg(sg, s, nents, i) { - if (sg_is_dma_bus_address(s)) { + if (sg_dma_is_bus_address(s)) { sg_dma_unmark_bus_address(s); } else { if (sg_dma_address(s) != DMA_MAPPING_ERROR) @@ -1329,7 +1329,7 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, * just have to be determined. 
*/ for_each_sg(sg, tmp, nents, i) { - if (sg_is_dma_bus_address(tmp)) { + if (sg_dma_is_bus_address(tmp)) { sg_dma_unmark_bus_address(tmp); continue; } @@ -1343,7 +1343,7 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, nents -= i; for_each_sg(tmp, tmp, nents, i) { - if (sg_is_dma_bus_address(tmp)) { + if (sg_dma_is_bus_address(tmp)) { sg_dma_unmark_bus_address(tmp); continue; } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f1dcfa3f1a1b..eb620552967b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2567,7 +2567,7 @@ ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, len = 0; } - if (sg_is_dma_bus_address(sg)) + if (sg_dma_is_bus_address(sg)) goto next; if (len) { diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 19833fd4113b..2f06178996ba 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -258,7 +258,7 @@ static inline void sg_unmark_end(struct scatterlist *sg) #define SG_DMA_BUS_ADDRESS (1 << 0) /** - * sg_dma_is_bus address - Return whether a given segment was marked + * sg_dma_is_bus_address - Return whether a given segment was marked * as a bus address * @sg: SG entry * @@ -266,13 +266,13 @@ static inline void sg_unmark_end(struct scatterlist *sg) * Returns true if sg_dma_mark_bus_address() has been called on * this segment. 
**/ -static inline bool sg_is_dma_bus_address(struct scatterlist *sg) +static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return sg->dma_flags & SG_DMA_BUS_ADDRESS; } /** - * sg_dma_mark_bus address - Mark the scatterlist entry as a bus address + * sg_dma_mark_bus_address - Mark the scatterlist entry as a bus address * @sg: SG entry * * Description: @@ -300,7 +300,7 @@ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) #else -static inline bool sg_is_dma_bus_address(struct scatterlist *sg) +static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return false; } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 5595d1d5cdcc..d29cade048db 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -463,7 +463,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int i; for_each_sg(sgl, sg, nents, i) { - if (sg_is_dma_bus_address(sg)) + if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else dma_direct_unmap_page(dev, sg->dma_address, -- cgit v1.2.3 From 370645f41e6e2fdd2fb6f6982530b04612c9793c Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:58 +0100 Subject: dma-mapping: force bouncing if the kmalloc() size is not cache-line-aligned For direct DMA, if the size is small enough to have originated from a kmalloc() cache below ARCH_DMA_MINALIGN, check its alignment against dma_get_cache_alignment() and bounce if necessary. For larger sizes, it is the responsibility of the DMA API caller to ensure proper alignment. At this point, the kmalloc() caches are properly aligned but this will change in a subsequent patch. Architectures can opt in by selecting DMA_BOUNCE_UNALIGNED_KMALLOC. Link: https://lkml.kernel.org/r/20230612153201.554742-15-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reviewed-by: Christoph Hellwig Reviewed-by: Robin Murphy Tested-by: Isaac J. 
Manjarres Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Joerg Roedel Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/dma-map-ops.h | 61 +++++++++++++++++++++++++++++++++++++++++++++ kernel/dma/Kconfig | 4 +++ kernel/dma/direct.h | 3 ++- 3 files changed, 67 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 31f114f486c4..9bf19b5bf755 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -8,6 +8,7 @@ #include <linux/dma-mapping.h> #include <linux/pgtable.h> +#include <linux/slab.h> struct cma; @@ -277,6 +278,66 @@ static inline bool dev_is_dma_coherent(struct device *dev) } #endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */ +/* + * Check whether potential kmalloc() buffers are safe for non-coherent DMA. + */ +static inline bool dma_kmalloc_safe(struct device *dev, + enum dma_data_direction dir) +{ + /* + * If DMA bouncing of kmalloc() buffers is disabled, the kmalloc() + * caches have already been aligned to a DMA-safe size. + */ + if (!IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC)) + return true; + + /* + * kmalloc() buffers are DMA-safe irrespective of size if the device + * is coherent or the direction is DMA_TO_DEVICE (non-destructive + * cache maintenance and benign cache line evictions). + */ + if (dev_is_dma_coherent(dev) || dir == DMA_TO_DEVICE) + return true; + + return false; +} + +/* + * Check whether the given size, assuming it is for a kmalloc()'ed buffer, is + * sufficiently aligned for non-coherent DMA. + */ +static inline bool dma_kmalloc_size_aligned(size_t size) +{ + /* + * Larger kmalloc() sizes are guaranteed to be aligned to + * ARCH_DMA_MINALIGN. 
+ */ + if (size >= 2 * ARCH_DMA_MINALIGN || + IS_ALIGNED(kmalloc_size_roundup(size), dma_get_cache_alignment())) + return true; + + return false; +} + +/* + * Check whether the given object size may have originated from a kmalloc() + * buffer with a slab alignment below the DMA-safe alignment and needs + * bouncing for non-coherent DMA. The pointer alignment is not considered and + * in-structure DMA-safe offsets are the responsibility of the caller. Such + * code should use the static ARCH_DMA_MINALIGN for compiler annotations. + * + * The heuristics can have false positives, bouncing unnecessarily, though the + * buffers would be small. False negatives are theoretically possible if, for + * example, multiple small kmalloc() buffers are coalesced into a larger + * buffer that passes the alignment check. There are no such known constructs + * in the kernel. + */ +static inline bool dma_kmalloc_needs_bounce(struct device *dev, size_t size, + enum dma_data_direction dir) +{ + return !dma_kmalloc_safe(dev, dir) && !dma_kmalloc_size_aligned(size); +} + void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index acc6f231259c..abea1823fe21 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -90,6 +90,10 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +config DMA_BOUNCE_UNALIGNED_KMALLOC + bool + depends on SWIOTLB + config DMA_RESTRICTED_POOL bool "DMA Restricted Pool" depends on OF && OF_RESERVED_MEM && SWIOTLB diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index e38ffc5e6bdd..97ec892ea0b5 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -94,7 +94,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, return swiotlb_map(dev, phys, size, dir, attrs); } - if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (unlikely(!dma_capable(dev, dma_addr, 
size, true)) || + dma_kmalloc_needs_bounce(dev, size, dir)) { if (is_pci_p2pdma_page(page)) return DMA_MAPPING_ERROR; if (is_swiotlb_active(dev)) -- cgit v1.2.3 From 861370f49ce484cd6ef2e9b3ad06d137f3cb0ca3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 12 Jun 2023 16:31:59 +0100 Subject: iommu/dma: force bouncing if the size is not cacheline-aligned Similarly to the direct DMA, bounce small allocations as they may have originated from a kmalloc() cache not safe for DMA. Unlike the direct DMA, iommu_dma_map_sg() cannot call iommu_dma_map_sg_swiotlb() for all non-coherent devices as this would break some cases where the iova is expected to be contiguous (dmabuf). Instead, scan the scatterlist for any small sizes and only go the swiotlb path if any element of the list needs bouncing (note that iommu_dma_map_page() would still only bounce those buffers which are not DMA-aligned). To avoid scanning the scatterlist on the 'sync' operations, introduce an SG_DMA_SWIOTLB flag set by iommu_dma_map_sg_swiotlb(). The dev_use_swiotlb() function together with the newly added dev_use_sg_swiotlb() now check for both untrusted devices and unaligned kmalloc() buffers (suggested by Robin Murphy). Link: https://lkml.kernel.org/r/20230612153201.554742-16-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reviewed-by: Robin Murphy Tested-by: Isaac J. Manjarres Cc: Joerg Roedel Cc: Christoph Hellwig Cc: Alasdair Kergon Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Jerry Snitselaar Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Logan Gunthorpe Cc: Marc Zyngier Cc: Mark Brown Cc: Mike Snitzer Cc: "Rafael J. 
Wysocki" Cc: Saravana Kannan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/iommu/Kconfig | 1 + drivers/iommu/dma-iommu.c | 50 +++++++++++++++++++++++++++++++++++++-------- include/linux/scatterlist.h | 41 +++++++++++++++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 6de900776e24..74c45359869d 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -152,6 +152,7 @@ config IOMMU_DMA select IOMMU_IOVA select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH + select NEED_SG_DMA_FLAGS if SWIOTLB # Shared Virtual Addressing config IOMMU_SVA diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index b8bba4aa196f..e86ae462cade 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -520,9 +520,38 @@ static bool dev_is_untrusted(struct device *dev) return dev_is_pci(dev) && to_pci_dev(dev)->untrusted; } -static bool dev_use_swiotlb(struct device *dev) +static bool dev_use_swiotlb(struct device *dev, size_t size, + enum dma_data_direction dir) { - return IS_ENABLED(CONFIG_SWIOTLB) && dev_is_untrusted(dev); + return IS_ENABLED(CONFIG_SWIOTLB) && + (dev_is_untrusted(dev) || + dma_kmalloc_needs_bounce(dev, size, dir)); +} + +static bool dev_use_sg_swiotlb(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + struct scatterlist *s; + int i; + + if (!IS_ENABLED(CONFIG_SWIOTLB)) + return false; + + if (dev_is_untrusted(dev)) + return true; + + /* + * If kmalloc() buffers are not DMA-safe for this device and + * direction, check the individual lengths in the sg list. If any + * element is deemed unsafe, use the swiotlb for bouncing. 
+ */ + if (!dma_kmalloc_safe(dev, dir)) { + for_each_sg(sg, s, nents, i) + if (!dma_kmalloc_size_aligned(s->length)) + return true; + } + + return false; } /** @@ -922,7 +951,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, { phys_addr_t phys; - if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev)) + if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev, size, dir)) return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); @@ -938,7 +967,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev, { phys_addr_t phys; - if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev)) + if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev, size, dir)) return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); @@ -956,7 +985,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg; int i; - if (dev_use_swiotlb(dev)) + if (sg_dma_is_swiotlb(sgl)) for_each_sg(sgl, sg, nelems, i) iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg), sg->length, dir); @@ -972,7 +1001,7 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg; int i; - if (dev_use_swiotlb(dev)) + if (sg_dma_is_swiotlb(sgl)) for_each_sg(sgl, sg, nelems, i) iommu_dma_sync_single_for_device(dev, sg_dma_address(sg), @@ -998,7 +1027,8 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, * If both the physical buffer start address and size are * page aligned, we don't need to use a bounce page. 
*/ - if (dev_use_swiotlb(dev) && iova_offset(iovad, phys | size)) { + if (dev_use_swiotlb(dev, size, dir) && + iova_offset(iovad, phys | size)) { void *padding_start; size_t padding_size, aligned_size; @@ -1166,6 +1196,8 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg, struct scatterlist *s; int i; + sg_dma_mark_swiotlb(sg); + for_each_sg(sg, s, nents, i) { sg_dma_address(s) = iommu_dma_map_page(dev, sg_page(s), s->offset, s->length, dir, attrs); @@ -1210,7 +1242,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, goto out; } - if (dev_use_swiotlb(dev)) + if (dev_use_sg_swiotlb(dev, sg, nents, dir)) return iommu_dma_map_sg_swiotlb(dev, sg, nents, dir, attrs); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) @@ -1315,7 +1347,7 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, struct scatterlist *tmp; int i; - if (dev_use_swiotlb(dev)) { + if (sg_dma_is_swiotlb(sg)) { iommu_dma_unmap_sg_swiotlb(dev, sg, nents, dir, attrs); return; } diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 2f06178996ba..ec46d8e8e49d 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -251,11 +251,13 @@ static inline void sg_unmark_end(struct scatterlist *sg) /* * One 64-bit architectures there is a 4-byte padding in struct scatterlist * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA - * flags bits to indicate when a specific dma address is a bus address. + * flags bits to indicate when a specific dma address is a bus address or the + * buffer may have been bounced via SWIOTLB. 
*/ #ifdef CONFIG_NEED_SG_DMA_FLAGS -#define SG_DMA_BUS_ADDRESS (1 << 0) +#define SG_DMA_BUS_ADDRESS (1 << 0) +#define SG_DMA_SWIOTLB (1 << 1) /** * sg_dma_is_bus_address - Return whether a given segment was marked @@ -298,6 +300,34 @@ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) sg->dma_flags &= ~SG_DMA_BUS_ADDRESS; } +/** + * sg_dma_is_swiotlb - Return whether the scatterlist was marked for SWIOTLB + * bouncing + * @sg: SG entry + * + * Description: + * Returns true if the scatterlist was marked for SWIOTLB bouncing. Not all + * elements may have been bounced, so the caller would have to check + * individual SG entries with is_swiotlb_buffer(). + */ +static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) +{ + return sg->dma_flags & SG_DMA_SWIOTLB; +} + +/** + * sg_dma_mark_swiotlb - Mark the scatterlist for SWIOTLB bouncing + * @sg: SG entry + * + * Description: + * Marks a scatterlist for SWIOTLB bounce. Not all SG entries may be + * bounced. + */ +static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) +{ + sg->dma_flags |= SG_DMA_SWIOTLB; +} + #else static inline bool sg_dma_is_bus_address(struct scatterlist *sg) @@ -310,6 +340,13 @@ static inline void sg_dma_mark_bus_address(struct scatterlist *sg) static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { } +static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) +{ + return false; +} +static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) +{ +} #endif /* CONFIG_NEED_SG_DMA_FLAGS */ -- cgit v1.2.3 From 6c1d2a073a1d850e79026411e79dff7ef997c90d Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 12 Jun 2023 16:15:44 +0100 Subject: mm: move ptep_get() and pmdp_get() helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are many call sites that directly dereference a pte_t pointer. 
This makes it very difficult to properly encapsulate a page table in the arch code without having to allocate shadow page tables. We will shortly solve this by replacing all the call sites with ptep_get() calls. But there are call sites above the function definition in the header file, so let's move ptep_get() to an earlier location to solve that problem. And move pmdp_get() at the same time to keep it close to ptep_get(). Link: https://lkml.kernel.org/r/20230612151545.3317766-3-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Cc: Adrian Hunter Cc: Alexander Potapenko Cc: Alexander Shishkin Cc: Alex Williamson Cc: Al Viro Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christian Brauner Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dave Airlie Cc: Dimitri Sivanich Cc: Dmitry Vyukov Cc: Ian Rogers Cc: Jason Gunthorpe Cc: Jérôme Glisse Cc: Jiri Olsa Cc: Johannes Weiner Cc: kernel test robot Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Mark Rutland Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Namhyung Kim Cc: Naoya Horiguchi Cc: Oleksandr Tyshchenko Cc: Pavel Tatashin Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a1326e61d7ee..fc06f6419661 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,6 +212,20 @@ static inline int pudp_set_access_flags(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} +#endif + +#ifndef pmdp_get +static inline pmd_t pmdp_get(pmd_t *pmdp) +{ + return READ_ONCE(*pmdp); +} +#endif + #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int 
ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, @@ -317,20 +331,6 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, ptep_get_and_clear(mm, addr, ptep); } -#ifndef ptep_get -static inline pte_t ptep_get(pte_t *ptep) -{ - return READ_ONCE(*ptep); -} -#endif - -#ifndef pmdp_get -static inline pmd_t pmdp_get(pmd_t *pmdp) -{ - return READ_ONCE(*pmdp); -} -#endif - #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures -- cgit v1.2.3 From c33c794828f21217f72ce6fc140e0d34e0d56bff Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 12 Jun 2023 16:15:45 +0100 Subject: mm: ptep_get() conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert all instances of direct pte_t* dereferencing to instead use ptep_get() helper. This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. Conversion was done using Coccinelle: ---- // $ make coccicheck \ // COCCI=ptepget.cocci \ // SPFLAGS="--include-headers" \ // MODE=patch virtual patch @ depends on patch @ pte_t *v; @@ - *v + ptep_get(v) ---- Then reviewed and hand-edited to avoid multiple unnecessary calls to ptep_get(), instead opting to store the result of a single call in a variable, where it is correct to do so. 
This aims to negate any cost of READ_ONCE() and will benefit arch-overrides that may be more complex. Included is a fix for an issue in an earlier version of this patch that was pointed out by kernel test robot. The issue arose because config MMU=n elides definition of the ptep helper functions, including ptep_get(). HUGETLB_PAGE=n configs still define a simple huge_ptep_clear_flush() for linking purposes, which dereferences the ptep. So when both configs are disabled, this caused a build error because ptep_get() is not defined. Fix by continuing to do a direct dereference when MMU=n. This is safe because for this config the arch code cannot be trying to virtualize the ptes because none of the ptep helpers are defined. Link: https://lkml.kernel.org/r/20230612151545.3317766-4-ryan.roberts@arm.com Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202305120142.yXsNEo6H-lkp@intel.com/ Signed-off-by: Ryan Roberts Cc: Adrian Hunter Cc: Alexander Potapenko Cc: Alexander Shishkin Cc: Alex Williamson Cc: Al Viro Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christian Brauner Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dave Airlie Cc: Dimitri Sivanich Cc: Dmitry Vyukov Cc: Ian Rogers Cc: Jason Gunthorpe Cc: Jérôme Glisse Cc: Jiri Olsa Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Mark Rutland Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Namhyung Kim Cc: Naoya Horiguchi Cc: Oleksandr Tyshchenko Cc: Pavel Tatashin Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Cc: Yu Zhao Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c | 8 +- drivers/misc/sgi-gru/grufault.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 7 +- drivers/xen/privcmd.c | 2 +- fs/proc/task_mmu.c | 33 +++---- fs/userfaultfd.c | 6 +- include/linux/hugetlb.h | 4 + include/linux/mm_inline.h | 2 +- include/linux/pgtable.h | 6 +- kernel/events/uprobes.c | 2 +- mm/damon/ops-common.c | 2 +- mm/damon/paddr.c | 2 +- mm/damon/vaddr.c | 10 ++- mm/filemap.c | 2 +- mm/gup.c | 21 +++-- mm/highmem.c | 12 +-- mm/hmm.c | 2 +- mm/huge_memory.c | 4 +- mm/hugetlb.c | 2 +- mm/hugetlb_vmemmap.c | 6 +- mm/kasan/init.c | 9 +- mm/kasan/shadow.c | 10 +-- mm/khugepaged.c | 22 ++--- mm/ksm.c | 22 ++--- mm/madvise.c | 6 +- mm/mapping_dirty_helpers.c | 4 +- mm/memcontrol.c | 4 +- mm/memory-failure.c | 26 +++--- mm/memory.c | 100 +++++++++++---------- mm/mempolicy.c | 6 +- mm/migrate.c | 14 +-- mm/migrate_device.c | 15 ++-- mm/mincore.c | 2 +- mm/mlock.c | 6 +- mm/mprotect.c | 8 +- mm/mremap.c | 2 +- mm/page_table_check.c | 4 +- mm/page_vma_mapped.c | 27 +++--- mm/pgtable-generic.c | 2 +- mm/rmap.c | 34 ++++--- mm/sparse-vmemmap.c | 8 +- mm/swap_state.c | 8 +- mm/swapfile.c | 20 +++-- mm/userfaultfd.c | 4 +- mm/vmalloc.c | 6 +- mm/vmscan.c | 14 +-- virt/kvm/kvm_main.c | 11 ++- 47 files changed, 301 insertions(+), 228 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c index 56279908ed30..01e271b6ad21 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c @@ -1681,7 
+1681,9 @@ static int igt_mmap_gpu(void *arg) static int check_present_pte(pte_t *pte, unsigned long addr, void *data) { - if (!pte_present(*pte) || pte_none(*pte)) { + pte_t ptent = ptep_get(pte); + + if (!pte_present(ptent) || pte_none(ptent)) { pr_err("missing PTE:%lx\n", (addr - (unsigned long)data) >> PAGE_SHIFT); return -EINVAL; @@ -1692,7 +1694,9 @@ static int check_present_pte(pte_t *pte, unsigned long addr, void *data) static int check_absent_pte(pte_t *pte, unsigned long addr, void *data) { - if (pte_present(*pte) && !pte_none(*pte)) { + pte_t ptent = ptep_get(pte); + + if (pte_present(ptent) && !pte_none(ptent)) { pr_err("present PTE:%lx; expected to be revoked\n", (addr - (unsigned long)data) >> PAGE_SHIFT); return -EINVAL; diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index 378cf02a2aa1..629edb6486de 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -228,7 +228,7 @@ static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr, goto err; #ifdef CONFIG_X86_64 if (unlikely(pmd_large(*pmdp))) - pte = *(pte_t *) pmdp; + pte = ptep_get((pte_t *)pmdp); else #endif pte = *pte_offset_kernel(pmdp, vaddr); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 306e6f1d1c70..ebe0ad31d0b0 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -514,6 +514,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, bool write_fault) { pte_t *ptep; + pte_t pte; spinlock_t *ptl; int ret; @@ -536,10 +537,12 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, return ret; } - if (write_fault && !pte_write(*ptep)) + pte = ptep_get(ptep); + + if (write_fault && !pte_write(pte)) ret = -EFAULT; else - *pfn = pte_pfn(*ptep); + *pfn = pte_pfn(pte); pte_unmap_unlock(ptep, ptl); return ret; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index e2f580e30a86..f447cd37cc4c 
100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -949,7 +949,7 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) */ static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) { - return pte_none(*pte) ? 0 : -EBUSY; + return pte_none(ptep_get(pte)) ? 0 : -EBUSY; } static int privcmd_vma_range_is_mapped( diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0d63b6a0f0d8..507cd4e59d07 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -538,13 +538,14 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool migration = false, young = false, dirty = false; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - young = pte_young(*pte); - dirty = pte_dirty(*pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + young = pte_young(ptent); + dirty = pte_dirty(ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (!non_swap_entry(swpent)) { int mapcount; @@ -732,11 +733,12 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; struct page *page = NULL; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) page = pfn_swap_entry_to_page(swpent); @@ -1105,7 +1107,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. 
*/ - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { pte_t old_pte; @@ -1194,7 +1196,7 @@ out: return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); @@ -1550,7 +1552,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, *pte); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -1893,10 +1895,11 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, return 0; } do { - struct page *page = can_gather_numa_stats(*pte, vma, addr); + pte_t ptent = ptep_get(pte); + struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; - gather_stats(page, md, pte_dirty(*pte), 1); + gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ca83423f8d54..478e2b169c13 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -335,6 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; + pte_t ptent; bool ret = true; mmap_assert_locked(mm); @@ -374,9 +375,10 @@ again: * changes under us. PTE markers should be handled the same as none * ptes here. 
*/ - if (pte_none_mostly(*pte)) + ptent = ptep_get(pte); + if (pte_none_mostly(ptent)) ret = true; - if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + if (!pte_write(ptent) && (reason & VM_UFFD_WP)) ret = true; pte_unmap(pte); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 21f942025fec..beb7c63d2871 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1185,7 +1185,11 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { +#ifdef CONFIG_MMU + return ptep_get(ptep); +#else return *ptep; +#endif } static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 0e1d239a882c..08c2bcefcb2b 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -555,7 +555,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ - WARN_ON_ONCE(!pte_none(*pte)); + WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index fc06f6419661..5063b482e34f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -231,7 +231,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; @@ -318,7 +318,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, address, pte); return pte; @@ -519,7 +519,7 @@ extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, struct mm_struct; static inline 
void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pte_t old_pte = *ptep; + pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 607d742caa61..f0ac5b874919 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, inc_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); ptep_clear_flush_notify(vma, addr, pvmw.pte); if (new_page) set_pte_at_notify(mm, addr, pvmw.pte, diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index d4ab81229136..e940802a15a4 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -39,7 +39,7 @@ struct folio *damon_get_folio(unsigned long pfn) void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = damon_get_folio(pte_pfn(*pte)); + struct folio *folio = damon_get_folio(pte_pfn(ptep_get(pte))); if (!folio) return; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 5b3a3463d078..40801e38fcf0 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -89,7 +89,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - *accessed = pte_young(*pvmw.pte) || + *accessed = pte_young(ptep_get(pvmw.pte)) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); } else { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index e814f66dfc2e..2fcc9731528a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -323,7 +323,7 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, walk->action = ACTION_AGAIN; return 0; } - if (!pte_present(*pte)) + if (!pte_present(ptep_get(pte))) goto out; damon_ptep_mkold(pte, walk->vma, addr); 
out: @@ -433,6 +433,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; + pte_t ptent; spinlock_t *ptl; struct folio *folio; struct damon_young_walk_private *priv = walk->private; @@ -471,12 +472,13 @@ regular_page: walk->action = ACTION_AGAIN; return 0; } - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) goto out; - folio = damon_get_folio(pte_pfn(*pte)); + folio = damon_get_folio(pte_pfn(ptent)); if (!folio) goto out; - if (pte_young(*pte) || !folio_test_idle(folio) || + if (pte_young(ptent) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; *priv->folio_sz = folio_size(folio); diff --git a/mm/filemap.c b/mm/filemap.c index 1893048ec9ff..00933089b8b6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3523,7 +3523,7 @@ again: * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(*vmf->pte)) + if (!pte_none(ptep_get(vmf->pte))) goto unlock; /* We're about to handle the fault */ diff --git a/mm/gup.c b/mm/gup.c index 838db6c0bfc2..38986e522d34 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -477,13 +477,14 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { if (flags & FOLL_TOUCH) { - pte_t entry = *pte; + pte_t orig_entry = ptep_get(pte); + pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); - if (!pte_same(*pte, entry)) { + if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } @@ -549,7 +550,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags); - pte = *ptep; + pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) @@ -821,6 +822,7 @@ 
static int get_gate_page(struct mm_struct *mm, unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t entry; int ret = -EFAULT; /* user gate pages are read-only */ @@ -844,16 +846,17 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; - if (pte_none(*pte)) + entry = ptep_get(pte); + if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; - *page = vm_normal_page(*vma, address, *pte); + *page = vm_normal_page(*vma, address, entry); if (!*page) { - if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) + if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; - *page = pte_page(*pte); + *page = pte_page(entry); } ret = try_grab_page(*page, gup_flags); if (unlikely(ret)) @@ -2496,7 +2499,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || - unlikely(pte_val(pte) != pte_val(*ptep))) { + unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2693,7 +2696,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if (!folio) return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { + if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, refs, flags); return 0; } diff --git a/mm/highmem.c b/mm/highmem.c index db251e77f98f..e19269093a93 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -161,7 +161,7 @@ struct page *__kmap_to_page(void *vaddr) /* kmap() mappings */ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP))) - return pte_page(pkmap_page_table[PKMAP_NR(addr)]); + return pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(addr)])); /* kmap_local_page() mappings */ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && @@ -191,6 +191,7 @@ static void flush_all_zero_pkmaps(void) for (i = 0; i < LAST_PKMAP; i++) { struct page *page; + pte_t ptent; /* * 
zero means we don't have anything to do, @@ -203,7 +204,8 @@ static void flush_all_zero_pkmaps(void) pkmap_count[i] = 0; /* sanity check */ - BUG_ON(pte_none(pkmap_page_table[i])); + ptent = ptep_get(&pkmap_page_table[i]); + BUG_ON(pte_none(ptent)); /* * Don't need an atomic fetch-and-clear op here; @@ -212,7 +214,7 @@ static void flush_all_zero_pkmaps(void) * getting the kmap_lock (which is held here). * So no dangers, even with speculative execution. */ - page = pte_page(pkmap_page_table[i]); + page = pte_page(ptent); pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); set_page_address(page, NULL); @@ -511,7 +513,7 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr) { #ifdef ARCH_NEEDS_KMAP_HIGH_GET if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { - kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); + kunmap_high(pte_page(ptep_get(&pkmap_page_table[PKMAP_NR(vaddr)]))); return true; } #endif @@ -548,7 +550,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); kmap_pte = kmap_get_pte(vaddr, idx); - BUG_ON(!pte_none(*kmap_pte)); + BUG_ON(!pte_none(ptep_get(kmap_pte))); pteval = pfn_pte(pfn, prot); arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval); arch_kmap_local_post_map(vaddr, pteval); diff --git a/mm/hmm.c b/mm/hmm.c index b1a9159d7c92..855e25e59d8f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -228,7 +228,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; unsigned long cpu_flags; - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); uint64_t pfn_req_flags = *hmm_pfn; if (pte_none_mostly(pte)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76f970aa5b4d..e94fe292f30a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2063,7 +2063,7 @@ static void __split_huge_zero_page_pmd(struct 
vm_area_struct *vma, entry = pte_mkspecial(entry); if (pmd_uffd_wp(old_pmd)) entry = pte_mkuffd_wp(entry); - VM_BUG_ON(!pte_none(*pte)); + VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } @@ -2257,7 +2257,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mkuffd_wp(entry); page_add_anon_rmap(page + i, vma, addr, false); } - VM_BUG_ON(!pte_none(*pte)); + VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d3d8a61b336..d76574425da3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7246,7 +7246,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte = (pte_t *)pmd_alloc(mm, pud, addr); } } - BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); + BUG_ON(pte && pte_present(ptep_get(pte)) && !pte_huge(ptep_get(pte))); return pte; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f42079b73f82..c2007ef5e9b0 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -105,7 +105,7 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, * remapping (which is calling @walk->remap_pte). */ if (!walk->reuse_page) { - walk->reuse_page = pte_page(*pte); + walk->reuse_page = pte_page(ptep_get(pte)); /* * Because the reuse address is part of the range that we are * walking, skip the reuse address range. @@ -239,7 +239,7 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * to the tail pages. 
*/ pgprot_t pgprot = PAGE_KERNEL_RO; - struct page *page = pte_page(*pte); + struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ @@ -286,7 +286,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct page *page; void *to; - BUG_ON(pte_page(*pte) != walk->reuse_page); + BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); diff --git a/mm/kasan/init.c b/mm/kasan/init.c index cc64ed6858c6..dcfec277e839 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -286,7 +286,7 @@ static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) for (i = 0; i < PTRS_PER_PTE; i++) { pte = pte_start + i; - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return; } @@ -343,16 +343,19 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, unsigned long end) { unsigned long next; + pte_t ptent; for (; addr < end; addr = next, pte++) { next = (addr + PAGE_SIZE) & PAGE_MASK; if (next > end) next = end; - if (!pte_present(*pte)) + ptent = ptep_get(pte); + + if (!pte_present(ptent)) continue; - if (WARN_ON(!kasan_early_shadow_page_entry(*pte))) + if (WARN_ON(!kasan_early_shadow_page_entry(ptent))) continue; pte_clear(&init_mm, addr, pte); } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 3e62728ae25d..dd772f9d0f08 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -226,7 +226,7 @@ static bool shadow_mapped(unsigned long addr) if (pmd_bad(*pmd)) return true; pte = pte_offset_kernel(pmd, addr); - return !pte_none(*pte); + return !pte_none(ptep_get(pte)); } static int __meminit kasan_mem_notifier(struct notifier_block *nb, @@ -317,7 +317,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, unsigned long page; pte_t pte; - if (likely(!pte_none(*ptep))) + if (likely(!pte_none(ptep_get(ptep)))) return 0; page = __get_free_page(GFP_KERNEL); @@ -328,7 +328,7 @@ static int 
kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); spin_lock(&init_mm.page_table_lock); - if (likely(pte_none(*ptep))) { + if (likely(pte_none(ptep_get(ptep)))) { set_pte_at(&init_mm, addr, ptep, pte); page = 0; } @@ -418,11 +418,11 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, { unsigned long page; - page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT); + page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT); spin_lock(&init_mm.page_table_lock); - if (likely(!pte_none(*ptep))) { + if (likely(!pte_none(ptep_get(ptep)))) { pte_clear(&init_mm, addr, ptep); free_page(page); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 881669e738c0..0b4f00712895 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -511,7 +511,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, struct folio *folio, *tmp; while (--_pte >= pte) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); unsigned long pfn; if (pte_none(pteval)) @@ -555,7 +555,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { ++none_or_zero; @@ -699,7 +699,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { - pteval = *_pte; + pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); if (is_zero_pfn(pte_pfn(pteval))) { @@ -797,7 +797,7 @@ static int __collapse_huge_page_copy(pte_t *pte, */ for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; _pte++, page++, _address += PAGE_SIZE) { - pteval = *_pte; + pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, 
_address); continue; @@ -1274,7 +1274,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; if (!cc->is_khugepaged || @@ -1650,18 +1650,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; + pte_t ptent = ptep_get(pte); /* empty pte, skip */ - if (pte_none(*pte)) + if (pte_none(ptent)) continue; /* page swapped out, abort */ - if (!pte_present(*pte)) { + if (!pte_present(ptent)) { result = SCAN_PTE_NON_PRESENT; goto abort; } - page = vm_normal_page(vma, addr, *pte); + page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) page = NULL; /* @@ -1677,10 +1678,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; + pte_t ptent = ptep_get(pte); - if (pte_none(*pte)) + if (pte_none(ptent)) continue; - page = vm_normal_page(vma, addr, *pte); + page = vm_normal_page(vma, addr, ptent); if (WARN_ON_ONCE(page && is_zone_device_page(page))) goto abort; page_remove_rmap(page, vma, false); diff --git a/mm/ksm.c b/mm/ksm.c index 3dc15459dd20..d995779dc1fe 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -429,15 +429,17 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex struct page *page = NULL; spinlock_t *ptl; pte_t *pte; + pte_t ptent; int ret; pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) return 0; - if (pte_present(*pte)) { - page = vm_normal_page(walk->vma, addr, *pte); - } else if (!pte_none(*pte)) { - swp_entry_t entry = pte_to_swp_entry(*pte); + ptent = ptep_get(pte); + if (pte_present(ptent)) { + page = vm_normal_page(walk->vma, 
addr, ptent); + } else if (!pte_none(ptent)) { + swp_entry_t entry = pte_to_swp_entry(ptent); /* * As KSM pages remain KSM pages until freed, no need to wait @@ -1085,6 +1087,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; + pte_t entry; pvmw.address = page_address_in_vma(page, vma); if (pvmw.address == -EFAULT) @@ -1102,10 +1105,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, goto out_unlock; anon_exclusive = PageAnonExclusive(page); - if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || + entry = ptep_get(pvmw.pte); + if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { - pte_t entry; - swapped = PageSwapCache(page); flush_cache_page(vma, pvmw.address, page_to_pfn(page)); /* @@ -1147,7 +1149,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } - *orig_pte = *pvmw.pte; + *orig_pte = entry; err = 0; out_unlock: @@ -1204,7 +1206,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; - if (!pte_same(*ptep, orig_pte)) { + if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } @@ -1231,7 +1233,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, dec_mm_counter(mm, MM_ANONPAGES); } - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. 
diff --git a/mm/madvise.c b/mm/madvise.c index 9b3c9610052f..886f06066622 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -207,7 +207,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, break; } - pte = *ptep; + pte = ptep_get(ptep); if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); @@ -438,7 +438,7 @@ regular_folio: flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr < end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (pte_none(ptent)) continue; @@ -642,7 +642,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (pte_none(ptent)) continue; diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 87b4beeda4fa..a26dd8bcfcdb 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -35,7 +35,7 @@ static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct wp_walk *wpwalk = walk->private; - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_write(ptent)) { pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); @@ -91,7 +91,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr, { struct wp_walk *wpwalk = walk->private; struct clean_walk *cwalk = to_clean_walk(wpwalk); - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_dirty(ptent)) { pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) + diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 77d8d2d14fcf..93056918e956 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6025,7 +6025,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, if (!pte) return 0; for (; addr != end; pte++, addr += PAGE_SIZE) - if (get_mctgt_type(vma, addr, *pte, NULL)) + if (get_mctgt_type(vma, addr, ptep_get(pte), NULL)) mc.precharge++; /* increment precharge temporarily 
*/ pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -6246,7 +6246,7 @@ retry: if (!pte) return 0; for (; addr != end; addr += PAGE_SIZE) { - pte_t ptent = *(pte++); + pte_t ptent = ptep_get(pte++); bool device = false; swp_entry_t ent; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d5116f0eb1b6..e245191e6b04 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -6,16 +6,16 @@ * High level machine check handler. Handles pages reported by the * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. - * + * * In addition there is a "soft offline" entry point that allows stop using * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronously in respect to - * other VM users, because memory failures could happen anytime and - * anywhere. This could violate some of their assumptions. This is why - * this code has to be extremely careful. Generally it tries to use - * normal locking rules, as in get the standard locks, even if that means + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. * * It can be very tempting to add handling for obscure cases here. @@ -25,12 +25,12 @@ * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ * - The case actually shows up as a frequent (top 10) page state in * tools/mm/page-types when running a real workload. - * + * * There are several operations here with exponential complexity because - * of unsuitable VM data structures. 
For example the operation to map back - * from RMAP chains to processes has to walk the complete process list and + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and * has non linear complexity with the number. But since memory corruptions - * are rare we hope to get away with this. This avoids impacting the core + * are rare we hope to get away with this. This avoids impacting the core * VM. */ @@ -386,6 +386,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t ptent; VM_BUG_ON_VMA(address == -EFAULT, vma); pgd = pgd_offset(vma->vm_mm, address); @@ -407,7 +408,8 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, pte = pte_offset_map(pmd, address); if (!pte) return 0; - if (pte_present(*pte) && pte_devmap(*pte)) + ptent = ptep_get(pte); + if (pte_present(ptent) && pte_devmap(ptent)) ret = PAGE_SHIFT; pte_unmap(pte); return ret; @@ -799,7 +801,7 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, goto out; for (; addr != end; ptep++, addr += PAGE_SIZE) { - ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT, hwp->pfn, &hwp->tk); if (ret == 1) break; diff --git a/mm/memory.c b/mm/memory.c index 63c30f58142b..3d78b552866d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -699,15 +699,17 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, struct page *page, unsigned long address, pte_t *ptep) { + pte_t orig_pte; pte_t pte; swp_entry_t entry; + orig_pte = ptep_get(ptep); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); - if (pte_swp_soft_dirty(*ptep)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*ptep); - if (pte_swp_uffd_wp(*ptep)) + entry = pte_to_swp_entry(orig_pte); + if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); 
else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -744,7 +746,7 @@ static int try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr) { - swp_entry_t entry = pte_to_swp_entry(*src_pte); + swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte)); struct page *page = pfn_swap_entry_to_page(entry); if (trylock_page(page)) { @@ -768,9 +770,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; - pte_t pte = *src_pte; + pte_t orig_pte = ptep_get(src_pte); + pte_t pte = orig_pte; struct page *page; - swp_entry_t entry = pte_to_swp_entry(pte); + swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) @@ -785,8 +788,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. 
*/ - if (pte_swp_exclusive(*src_pte)) { - pte = pte_swp_clear_exclusive(*src_pte); + if (pte_swp_exclusive(orig_pte)) { + pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; @@ -805,9 +808,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_migration_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) + if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -840,7 +843,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(*src_pte)) + if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } @@ -904,7 +907,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma /* All done, just insert the new page copy in the child */ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); - if (userfaultfd_pte_wp(dst_vma, *src_pte)) + if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); @@ -922,7 +925,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, { struct mm_struct *src_mm = src_vma->vm_mm; unsigned long vm_flags = src_vma->vm_flags; - pte_t pte = *src_pte; + pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; @@ -1002,6 +1005,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; + pte_t ptent; spinlock_t *src_ptl, 
*dst_ptl; int progress, ret = 0; int rss[NR_MM_COUNTERS]; @@ -1047,17 +1051,18 @@ again: spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } - if (pte_none(*src_pte)) { + ptent = ptep_get(src_pte); + if (pte_none(ptent)) { progress++; continue; } - if (unlikely(!pte_present(*src_pte))) { + if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(*src_pte); + entry = pte_to_swp_entry(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -1407,7 +1412,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); struct page *page; if (pte_none(ptent)) @@ -1822,7 +1827,7 @@ static int validate_page_before_insert(struct page *page) static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { - if (!pte_none(*pte)) + if (!pte_none(ptep_get(pte))) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); @@ -2116,7 +2121,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; - if (!pte_none(*pte)) { + entry = ptep_get(pte); + if (!pte_none(entry)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed @@ -2128,11 +2134,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. 
*/ - if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { - WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); + if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } - entry = pte_mkyoung(*pte); + entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); @@ -2344,7 +2350,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return -ENOMEM; arch_enter_lazy_mmu_mode(); do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; @@ -2585,7 +2591,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { - if (create || !pte_none(*pte)) { + if (create || !pte_none(ptep_get(pte))) { err = fn(pte++, addr, data); if (err) break; @@ -2787,7 +2793,7 @@ static inline int pte_unmap_same(struct vm_fault *vmf) #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spin_lock(vmf->ptl); - same = pte_same(*vmf->pte, vmf->orig_pte); + same = pte_same(ptep_get(vmf->pte), vmf->orig_pte); spin_unlock(vmf->ptl); } #endif @@ -2838,7 +2844,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only @@ -2866,7 +2872,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { /* The PTE changed under us, update local 
tlb */ if (vmf->pte) update_mmu_tlb(vma, addr, vmf->pte); @@ -3114,7 +3120,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) { + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { if (old_folio) { if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(&old_folio->page)); @@ -3241,7 +3247,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) * We might have raced with another page fault while we released the * pte_offset_map_lock. */ - if (!pte_same(*vmf->pte, vmf->orig_pte)) { + if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; @@ -3336,7 +3342,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) struct folio *folio = NULL; if (likely(!unshare)) { - if (userfaultfd_pte_wp(vma, *vmf->pte)) { + if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } @@ -3598,7 +3604,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); if (vmf->pte) @@ -3643,7 +3649,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. * So is_pte_marker() check is not enough to safely drop the pte. 
*/ - if (pte_same(vmf->orig_pte, *vmf->pte)) + if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3739,7 +3745,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || - !pte_same(*vmf->pte, vmf->orig_pte))) + !pte_same(ptep_get(vmf->pte), + vmf->orig_pte))) goto unlock; /* @@ -3816,7 +3823,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) + if (likely(vmf->pte && + pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } @@ -3886,7 +3894,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) + if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { @@ -4331,9 +4339,9 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) static bool vmf_pte_changed(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) - return !pte_same(*vmf->pte, vmf->orig_pte); + return !pte_same(ptep_get(vmf->pte), vmf->orig_pte); - return !pte_none(*vmf->pte); + return !pte_none(ptep_get(vmf->pte)); } /** @@ -4643,7 +4651,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) * we don't have concurrent modification by hardware * followed by an update. */ - if (unlikely(pte_none(*vmf->pte))) + if (unlikely(pte_none(ptep_get(vmf->pte)))) ret = VM_FAULT_SIGBUS; else ret = VM_FAULT_NOPAGE; @@ -4699,7 +4707,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * the pfn may be screwed if the read is non atomic. 
*/ spin_lock(vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4772,7 +4780,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) goto out; - if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -4930,7 +4938,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) spin_lock(vmf->ptl); entry = vmf->orig_pte; - if (unlikely(!pte_same(*vmf->pte, entry))) { + if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; } @@ -5416,7 +5424,7 @@ int follow_pte(struct mm_struct *mm, unsigned long address, ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!ptep) goto out; - if (!pte_present(*ptep)) + if (!pte_present(ptep_get(ptep))) goto unlock; *ptepp = ptep; return 0; @@ -5453,7 +5461,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return ret; - *pfn = pte_pfn(*ptep); + *pfn = pte_pfn(ptep_get(ptep)); pte_unmap_unlock(ptep, ptl); return 0; } @@ -5473,7 +5481,7 @@ int follow_phys(struct vm_area_struct *vma, if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; - pte = *ptep; + pte = ptep_get(ptep); if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -5517,7 +5525,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) return -EINVAL; - pte = *ptep; + pte = ptep_get(ptep); pte_unmap_unlock(ptep, ptl); prot = pgprot_val(pte_pgprot(pte)); @@ -5533,7 +5541,7 @@ retry: if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) goto out_unmap; - if (!pte_same(pte, *ptep)) { + if (!pte_same(pte, ptep_get(ptep))) { pte_unmap_unlock(ptep, ptl); 
iounmap(maddr); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0241bb64978b..edc25195f5bd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -508,6 +508,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long flags = qp->flags; bool has_unmovable = false; pte_t *pte, *mapped_pte; + pte_t ptent; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); @@ -520,9 +521,10 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) continue; - folio = vm_normal_folio(vma, addr, *pte); + folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* diff --git a/mm/migrate.c b/mm/migrate.c index 363562992046..ce35afdbc1e3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -188,6 +188,7 @@ static bool remove_migration_pte(struct folio *folio, while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; + pte_t old_pte; pte_t pte; swp_entry_t entry; struct page *new; @@ -210,17 +211,18 @@ static bool remove_migration_pte(struct folio *folio, folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pvmw.pte)) + old_pte = ptep_get(pvmw.pte); + if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(*pvmw.pte); + entry = pte_to_swp_entry(old_pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) pte = pte_mkwrite(pte); - else if (pte_swp_uffd_wp(*pvmw.pte)) + else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) @@ -234,9 +236,9 @@ static bool remove_migration_pte(struct folio *folio, entry = make_readable_device_private_entry( page_to_pfn(new)); pte = swp_entry_to_pte(entry); - if 
(pte_swp_soft_dirty(*pvmw.pte)) + if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*pvmw.pte)) + if (pte_swp_uffd_wp(old_pte)) pte = pte_swp_mkuffd_wp(pte); } @@ -308,7 +310,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, if (!ptep) return; - pte = *ptep; + pte = ptep_get(ptep); pte_unmap(ptep); if (!is_swap_pte(pte)) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index a14af6b12b04..02d272b909b5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -111,7 +111,7 @@ again: swp_entry_t entry; pte_t pte; - pte = *ptep; + pte = ptep_get(ptep); if (pte_none(pte)) { if (vma_is_anonymous(vma)) { @@ -194,7 +194,7 @@ again: bool anon_exclusive; pte_t swp_pte; - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(pte)); anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive) { pte = ptep_clear_flush(vma, addr, ptep); @@ -573,6 +573,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, pud_t *pudp; pmd_t *pmdp; pte_t *ptep; + pte_t orig_pte; /* Only allow populating anonymous memory */ if (!vma_is_anonymous(vma)) @@ -628,16 +629,18 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); if (!ptep) goto abort; + orig_pte = ptep_get(ptep); + if (check_stable_address_space(mm)) goto unlock_abort; - if (pte_present(*ptep)) { - unsigned long pfn = pte_pfn(*ptep); + if (pte_present(orig_pte)) { + unsigned long pfn = pte_pfn(orig_pte); if (!is_zero_pfn(pfn)) goto unlock_abort; flush = true; - } else if (!pte_none(*ptep)) + } else if (!pte_none(orig_pte)) goto unlock_abort; /* @@ -654,7 +657,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, get_page(page); if (flush) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); + flush_cache_page(vma, addr, pte_pfn(orig_pte)); ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, entry); 
update_mmu_cache(vma, addr, ptep); diff --git a/mm/mincore.c b/mm/mincore.c index f33f6a0b1ded..b7f7a516b26c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -119,7 +119,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } for (; addr != end; ptep++, addr += PAGE_SIZE) { - pte_t pte = *ptep; + pte_t pte = ptep_get(ptep); /* We need to do cache lookup too for pte markers */ if (pte_none_mostly(pte)) diff --git a/mm/mlock.c b/mm/mlock.c index 9f2b1173b1b1..d7db94519884 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -312,6 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; + pte_t ptent; struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); @@ -334,9 +335,10 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, return 0; } for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { - if (!pte_present(*pte)) + ptent = ptep_get(pte); + if (!pte_present(ptent)) continue; - folio = vm_normal_folio(vma, addr, *pte); + folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio)) diff --git a/mm/mprotect.c b/mm/mprotect.c index 64e1df0af514..327a6eb90afb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,7 +105,7 @@ static long change_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { - oldpte = *pte; + oldpte = ptep_get(pte); if (pte_present(oldpte)) { pte_t ptent; @@ -544,7 +544,8 @@ long change_protection(struct mmu_gather *tlb, static int prot_none_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 
0 : -EACCES; } @@ -552,7 +553,8 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk) { - return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + return pfn_modify_allowed(pte_pfn(ptep_get(pte)), + *(pgprot_t *)(walk->private)) ? 0 : -EACCES; } diff --git a/mm/mremap.c b/mm/mremap.c index bfc3d1902a94..8ec184ac90ff 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -188,7 +188,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { - if (pte_none(*old_pte)) + if (pte_none(ptep_get(old_pte))) continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 0c511330dbc9..8f89f9c8f0df 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -190,7 +190,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, addr, *ptep); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); if (pte_user_accessible_page(pte)) { page_table_check_set(mm, addr, pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, @@ -243,7 +243,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, addr, *ptep); + __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 2af734274073..49e0d28f0379 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -15,6 +15,8 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) { + pte_t ptent; + if (pvmw->flags & PVMW_SYNC) { /* Use the stricter lookup */ pvmw->pte = pte_offset_map_lock(pvmw->vma->vm_mm, pvmw->pmd, @@ -35,10 +37,12 
@@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) if (!pvmw->pte) return false; + ptent = ptep_get(pvmw->pte); + if (pvmw->flags & PVMW_MIGRATION) { - if (!is_swap_pte(*pvmw->pte)) + if (!is_swap_pte(ptent)) return false; - } else if (is_swap_pte(*pvmw->pte)) { + } else if (is_swap_pte(ptent)) { swp_entry_t entry; /* * Handle un-addressable ZONE_DEVICE memory. @@ -56,11 +60,11 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) * For more details on device private memory see HMM * (include/linux/hmm.h or mm/hmm.c). */ - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; - } else if (!pte_present(*pvmw->pte)) { + } else if (!pte_present(ptent)) { return false; } pvmw->ptl = *ptlp; @@ -90,33 +94,34 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) static bool check_pte(struct page_vma_mapped_walk *pvmw) { unsigned long pfn; + pte_t ptent = ptep_get(pvmw->pte); if (pvmw->flags & PVMW_MIGRATION) { swp_entry_t entry; - if (!is_swap_pte(*pvmw->pte)) + if (!is_swap_pte(ptent)) return false; - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_migration_entry(entry) && !is_device_exclusive_entry(entry)) return false; pfn = swp_offset_pfn(entry); - } else if (is_swap_pte(*pvmw->pte)) { + } else if (is_swap_pte(ptent)) { swp_entry_t entry; /* Handle un-addressable ZONE_DEVICE memory */ - entry = pte_to_swp_entry(*pvmw->pte); + entry = pte_to_swp_entry(ptent); if (!is_device_private_entry(entry) && !is_device_exclusive_entry(entry)) return false; pfn = swp_offset_pfn(entry); } else { - if (!pte_present(*pvmw->pte)) + if (!pte_present(ptent)) return false; - pfn = pte_pfn(*pvmw->pte); + pfn = pte_pfn(ptent); } return (pfn - pvmw->pfn) < pvmw->nr_pages; @@ -294,7 +299,7 @@ next_pte: goto restart; } pvmw->pte++; - } while (pte_none(*pvmw->pte)); + } while 
(pte_none(ptep_get(pvmw->pte))); if (!pvmw->ptl) { pvmw->ptl = ptl; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c7ab18a5fb77..4d454953046f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -68,7 +68,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { - int changed = !pte_same(*ptep, entry); + int changed = !pte_same(ptep_get(ptep), entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); flush_tlb_fix_spurious_fault(vma, address, ptep); diff --git a/mm/rmap.c b/mm/rmap.c index cd918cb9a431..0c0d8857dfce 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -826,7 +826,8 @@ static bool folio_referenced_one(struct folio *folio, } if (pvmw.pte) { - if (lru_gen_enabled() && pte_young(*pvmw.pte)) { + if (lru_gen_enabled() && + pte_young(ptep_get(pvmw.pte))) { lru_gen_look_around(&pvmw); referenced++; } @@ -956,13 +957,13 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) address = pvmw->address; if (pvmw->pte) { - pte_t entry; pte_t *pte = pvmw->pte; + pte_t entry = ptep_get(pte); - if (!pte_dirty(*pte) && !pte_write(*pte)) + if (!pte_dirty(entry) && !pte_write(entry)) continue; - flush_cache_page(vma, address, pte_pfn(*pte)); + flush_cache_page(vma, address, pte_pfn(entry)); entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); @@ -1137,7 +1138,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) * @folio: Folio which contains page. * @page: Page to add to rmap. * @vma: VM area to add page to. 
- * @address: User virtual address of the mapping + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct folio *folio, struct page *page, @@ -1458,6 +1459,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long pfn; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1508,8 +1510,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + pfn = pte_pfn(ptep_get(pvmw.pte)); + subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); @@ -1571,7 +1573,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* @@ -1818,6 +1820,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + unsigned long pfn; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1877,6 +1880,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Unexpected PMD-mapped THP? 
*/ VM_BUG_ON_FOLIO(!pvmw.pte, folio); + pfn = pte_pfn(ptep_get(pvmw.pte)); + if (folio_is_zone_device(folio)) { /* * Our PTE is a non-present device exclusive entry and @@ -1891,8 +1896,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); subpage = &folio->page; } else { - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + subpage = folio_page(folio, pfn - folio_pfn(folio)); } address = pvmw.address; anon_exclusive = folio_test_anon(folio) && @@ -1952,7 +1956,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* @@ -2187,6 +2191,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, struct mmu_notifier_range range; swp_entry_t entry; pte_t swp_pte; + pte_t ptent; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, @@ -2198,18 +2203,19 @@ static bool page_make_device_exclusive_one(struct folio *folio, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); - if (!pte_present(*pvmw.pte)) { + ptent = ptep_get(pvmw.pte); + if (!pte_present(ptent)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + pte_pfn(ptent) - folio_pfn(folio)); address = pvmw.address; /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + flush_cache_page(vma, address, pte_pfn(ptent)); pteval = ptep_clear_flush(vma, address, pvmw.pte); /* Set the dirty flag on the folio now the pte is gone. 
*/ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 10d73a0dfcec..a044a130405b 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -133,7 +133,7 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size, void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { - unsigned long pfn = pte_pfn(*pte); + unsigned long pfn = pte_pfn(ptep_get(pte)); int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) @@ -146,7 +146,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct page *reuse) { pte_t *pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) { + if (pte_none(ptep_get(pte))) { pte_t entry; void *p; @@ -414,7 +414,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, * with just tail struct pages. */ return vmemmap_populate_range(start, end, node, NULL, - pte_page(*pte)); + pte_page(ptep_get(pte))); } size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page)); @@ -438,7 +438,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, */ next += PAGE_SIZE; rc = vmemmap_populate_range(next, last, node, NULL, - pte_page(*pte)); + pte_page(ptep_get(pte))); if (rc) return -ENOMEM; } diff --git a/mm/swap_state.c b/mm/swap_state.c index a33c60e0158f..4a5c7b748051 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -275,9 +275,9 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, } } -/* - * If we are the only user, then try to free up the swap cache. - * +/* + * If we are the only user, then try to free up the swap cache. + * * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside * folio_free_swap() _with_ the lock. 
@@ -294,7 +294,7 @@ void free_swap_cache(struct page *page) } } -/* +/* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 74dd4d2337b7..a6945c2e0d03 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1745,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; - pte_t *pte, new_pte; + pte_t *pte, new_pte, old_pte; bool hwposioned = false; int ret = 1; @@ -1757,11 +1757,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, hwposioned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte || !pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), + swp_entry_to_pte(entry)))) { ret = 0; goto out; } + old_pte = ptep_get(pte); + if (unlikely(hwposioned || !PageUptodate(page))) { swp_entry_t swp_entry; @@ -1793,7 +1796,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * call and have the page locked. 
*/ VM_BUG_ON_PAGE(PageWriteback(page), page); - if (pte_swp_exclusive(*pte)) + if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; page_add_anon_rmap(page, vma, addr, rmap_flags); @@ -1802,9 +1805,9 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, lru_cache_add_inactive_or_unevictable(page, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pte)) + if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); - if (pte_swp_uffd_wp(*pte)) + if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); @@ -1833,6 +1836,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned char swp_count; swp_entry_t entry; int ret; + pte_t ptent; if (!pte++) { pte = pte_offset_map(pmd, addr); @@ -1840,10 +1844,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, break; } - if (!is_swap_pte(*pte)) + ptent = ptep_get_lockless(pte); + + if (!is_swap_pte(ptent)) continue; - entry = pte_to_swp_entry(*pte); + entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5fd787158c70..a2bf37ee276d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -97,7 +97,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. 
*/ - if (!pte_none_mostly(*dst_pte)) + if (!pte_none_mostly(ptep_get(dst_pte))) goto out_unlock; folio = page_folio(page); @@ -230,7 +230,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, goto out_unlock; } ret = -EEXIST; - if (!pte_none(*dst_pte)) + if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7382e0a60ce1..5a3bf408251b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -103,7 +103,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; do { - BUG_ON(!pte_none(*pte)); + BUG_ON(!pte_none(ptep_get(pte))); #ifdef CONFIG_HUGETLB_PAGE size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); @@ -472,7 +472,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, do { struct page *page = pages[*nr]; - if (WARN_ON(!pte_none(*pte))) + if (WARN_ON(!pte_none(ptep_get(pte)))) return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; @@ -704,7 +704,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) return NULL; ptep = pte_offset_kernel(pmd, addr); - pte = *ptep; + pte = ptep_get(ptep); if (pte_present(pte)) page = pte_page(pte); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3f64c8d9f629..e305c11ec8fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4037,15 +4037,16 @@ restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; struct folio *folio; + pte_t ptent = ptep_get(pte + i); total++; walk->mm_stats[MM_LEAF_TOTAL]++; - pfn = get_pte_pfn(pte[i], args->vma, addr); + pfn = get_pte_pfn(ptent, args->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) { + if (!pte_young(ptent)) { walk->mm_stats[MM_LEAF_OLD]++; continue; } @@ -4060,7 +4061,7 @@ restart: young++; walk->mm_stats[MM_LEAF_YOUNG]++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && 
!folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); @@ -4703,12 +4704,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; + pte_t ptent = ptep_get(pte + i); - pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + pfn = get_pte_pfn(ptent, pvmw->vma, addr); if (pfn == -1) continue; - if (!pte_young(pte[i])) + if (!pte_young(ptent)) continue; folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); @@ -4720,7 +4722,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) young++; - if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 51e4882d0873..fb37adecfc91 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2578,6 +2578,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, { kvm_pfn_t pfn; pte_t *ptep; + pte_t pte; spinlock_t *ptl; int r; @@ -2601,14 +2602,16 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, return r; } - if (write_fault && !pte_write(*ptep)) { + pte = ptep_get(ptep); + + if (write_fault && !pte_write(pte)) { pfn = KVM_PFN_ERR_RO_FAULT; goto out; } if (writable) - *writable = pte_write(*ptep); - pfn = pte_pfn(*ptep); + *writable = pte_write(pte); + pfn = pte_pfn(pte); /* * Get a reference here because callers of *hva_to_pfn* and @@ -2626,7 +2629,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * tail pages of non-compound higher order allocations, which * would then underflow the refcount when the caller does the * required put_page. Don't allow those pages here. 
- */ + */ if (!kvm_try_get_pfn(pfn)) r = -EFAULT; -- cgit v1.2.3 From 35499e2b79ffc51ea704c3268a5830164825a43e Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Mon, 12 Jun 2023 11:38:13 +0200 Subject: mm: zswap: remove shrink from zpool interface Now that all three zswap backends have removed their shrink code, it is no longer necessary for the zpool interface to include shrink/writeback endpoints. Link: https://lkml.kernel.org/r/20230612093815.133504-6-cerasuolodomenico@gmail.com Signed-off-by: Domenico Cerasuolo Reviewed-by: Yosry Ahmed Acked-by: Nhat Pham Acked-by: Johannes Weiner Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Minchan Kim Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- include/linux/zpool.h | 20 ++------------------ mm/z3fold.c | 4 +--- mm/zbud.c | 4 +--- mm/zpool.c | 48 ++---------------------------------------------- mm/zsmalloc.c | 4 +--- mm/zswap.c | 27 +++++++-------------------- 6 files changed, 14 insertions(+), 93 deletions(-) (limited to 'include') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index e8997010612a..3296438eec06 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -14,10 +14,6 @@ struct zpool; -struct zpool_ops { - int (*evict)(struct zpool *pool, unsigned long handle); -}; - /* * Control how a handle is mapped. It will be ignored if the * implementation does not support it. Its use is optional. 
@@ -39,8 +35,7 @@ enum zpool_mapmode { bool zpool_has_pool(char *type); -struct zpool *zpool_create_pool(const char *type, const char *name, - gfp_t gfp, const struct zpool_ops *ops); +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp); const char *zpool_get_type(struct zpool *pool); @@ -53,9 +48,6 @@ int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, void zpool_free(struct zpool *pool, unsigned long handle); -int zpool_shrink(struct zpool *pool, unsigned int pages, - unsigned int *reclaimed); - void *zpool_map_handle(struct zpool *pool, unsigned long handle, enum zpool_mapmode mm); @@ -72,7 +64,6 @@ u64 zpool_get_total_size(struct zpool *pool); * @destroy: destroy a pool. * @malloc: allocate mem from a pool. * @free: free mem from a pool. - * @shrink: shrink the pool. * @sleep_mapped: whether zpool driver can sleep during map. * @map: map a handle. * @unmap: unmap a handle. @@ -87,10 +78,7 @@ struct zpool_driver { atomic_t refcount; struct list_head list; - void *(*create)(const char *name, - gfp_t gfp, - const struct zpool_ops *ops, - struct zpool *zpool); + void *(*create)(const char *name, gfp_t gfp); void (*destroy)(void *pool); bool malloc_support_movable; @@ -98,9 +86,6 @@ struct zpool_driver { unsigned long *handle); void (*free)(void *pool, unsigned long handle); - int (*shrink)(void *pool, unsigned int pages, - unsigned int *reclaimed); - bool sleep_mapped; void *(*map)(void *pool, unsigned long handle, enum zpool_mapmode mm); @@ -113,7 +98,6 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); -bool zpool_evictable(struct zpool *pool); bool zpool_can_sleep_mapped(struct zpool *pool); #endif diff --git a/mm/z3fold.c b/mm/z3fold.c index 238a214de59f..e84de91ecccb 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -1364,9 +1364,7 @@ static const struct movable_operations z3fold_mops = { * zpool ****************/ -static void *z3fold_zpool_create(const char 
*name, gfp_t gfp, - const struct zpool_ops *zpool_ops, - struct zpool *zpool) +static void *z3fold_zpool_create(const char *name, gfp_t gfp) { return z3fold_create_pool(name, gfp); } diff --git a/mm/zbud.c b/mm/zbud.c index 9d35fd4091ed..2190cc1f37b3 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -380,9 +380,7 @@ static u64 zbud_get_pool_size(struct zbud_pool *pool) * zpool ****************/ -static void *zbud_zpool_create(const char *name, gfp_t gfp, - const struct zpool_ops *zpool_ops, - struct zpool *zpool) +static void *zbud_zpool_create(const char *name, gfp_t gfp) { return zbud_create_pool(gfp); } diff --git a/mm/zpool.c b/mm/zpool.c index 6a19c4a58f77..846410479c2f 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -133,7 +133,6 @@ EXPORT_SYMBOL(zpool_has_pool); * @type: The type of the zpool to create (e.g. zbud, zsmalloc) * @name: The name of the zpool (e.g. zram0, zswap) * @gfp: The GFP flags to use when allocating the pool. - * @ops: The optional ops callback. * * This creates a new zpool of the specified type. The gfp flags will be * used when allocating memory, if the implementation supports it. If the @@ -145,8 +144,7 @@ EXPORT_SYMBOL(zpool_has_pool); * * Returns: New zpool on success, NULL on failure. */ -struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, - const struct zpool_ops *ops) +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp) { struct zpool_driver *driver; struct zpool *zpool; @@ -173,7 +171,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, } zpool->driver = driver; - zpool->pool = driver->create(name, gfp, ops, zpool); + zpool->pool = driver->create(name, gfp); if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -279,30 +277,6 @@ void zpool_free(struct zpool *zpool, unsigned long handle) zpool->driver->free(zpool->pool, handle); } -/** - * zpool_shrink() - Shrink the pool size - * @zpool: The zpool to shrink. 
- * @pages: The number of pages to shrink the pool. - * @reclaimed: The number of pages successfully evicted. - * - * This attempts to shrink the actual memory size of the pool - * by evicting currently used handle(s). If the pool was - * created with no zpool_ops, or the evict call fails for any - * of the handles, this will fail. If non-NULL, the @reclaimed - * parameter will be set to the number of pages reclaimed, - * which may be more than the number of pages requested. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: 0 on success, negative value on error/failure. - */ -int zpool_shrink(struct zpool *zpool, unsigned int pages, - unsigned int *reclaimed) -{ - return zpool->driver->shrink ? - zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL; -} - /** * zpool_map_handle() - Map a previously allocated handle into memory * @zpool: The zpool that the handle was allocated from @@ -359,24 +333,6 @@ u64 zpool_get_total_size(struct zpool *zpool) return zpool->driver->total_size(zpool->pool); } -/** - * zpool_evictable() - Test if zpool is potentially evictable - * @zpool: The zpool to test - * - * Zpool is only potentially evictable when it's created with struct - * zpool_ops.evict and its driver implements struct zpool_driver.shrink. - * - * However, it doesn't necessarily mean driver will use zpool_ops.evict - * in its implementation of zpool_driver.shrink. It could do internal - * defragmentation instead. - * - * Returns: true if potentially evictable; false otherwise. - */ -bool zpool_evictable(struct zpool *zpool) -{ - return zpool->driver->shrink; -} - /** * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. 
* @zpool: The zpool to test diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e4d1ad521738..3f057970504e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -351,9 +351,7 @@ static void record_obj(unsigned long handle, unsigned long obj) #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(const char *name, gfp_t gfp, - const struct zpool_ops *zpool_ops, - struct zpool *zpool) +static void *zs_zpool_create(const char *name, gfp_t gfp) { /* * Ignore global gfp flags: zs_malloc() may be invoked from diff --git a/mm/zswap.c b/mm/zswap.c index 0024ec5ed574..a4f8c20e161b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -258,10 +258,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle); static int zswap_pool_get(struct zswap_pool *pool); static void zswap_pool_put(struct zswap_pool *pool); -static const struct zpool_ops zswap_zpool_ops = { - .evict = zswap_writeback_entry -}; - static bool zswap_is_full(void) { return totalram_pages() * zswap_max_pool_percent / 100 < @@ -379,12 +375,9 @@ static void zswap_free_entry(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - /* zpool_evictable will be removed once all 3 backends have migrated */ - if (!zpool_evictable(entry->pool->zpool)) { - spin_lock(&entry->pool->lru_lock); - list_del(&entry->lru); - spin_unlock(&entry->pool->lru_lock); - } + spin_lock(&entry->pool->lru_lock); + list_del(&entry->lru); + spin_unlock(&entry->pool->lru_lock); zpool_free(entry->pool->zpool, entry->handle); zswap_pool_put(entry->pool); } @@ -665,12 +658,8 @@ static void shrink_worker(struct work_struct *w) shrink_work); int ret, failures = 0; - /* zpool_evictable will be removed once all 3 backends have migrated */ do { - if (zpool_evictable(pool->zpool)) - ret = zpool_shrink(pool->zpool, 1, NULL); - else - ret = zswap_reclaim_entry(pool); + ret = zswap_reclaim_entry(pool); if (ret) { zswap_reject_reclaim_fail++; if (ret != -EAGAIN) @@ -708,7 +697,7 @@ static struct zswap_pool 
*zswap_pool_create(char *type, char *compressor) /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); - pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); + pool->zpool = zpool_create_pool(type, name, gfp); if (!pool->zpool) { pr_err("%s zpool not available\n", type); goto error; @@ -1394,8 +1383,7 @@ insert_entry: zswap_entry_put(tree, dupentry); } } while (ret == -EEXIST); - /* zpool_evictable will be removed once all 3 backends have migrated */ - if (entry->length && !zpool_evictable(entry->pool->zpool)) { + if (entry->length) { spin_lock(&entry->pool->lru_lock); list_add(&entry->lru, &entry->pool->lru); spin_unlock(&entry->pool->lru_lock); @@ -1514,8 +1502,7 @@ freeentry: if (!ret && zswap_exclusive_loads_enabled) { zswap_invalidate_entry(tree, entry); *exclusive = true; - } else if (entry->length && !zpool_evictable(entry->pool->zpool)) { - /* zpool_evictable will be removed once all 3 backends have migrated */ + } else if (entry->length) { spin_lock(&entry->pool->lru_lock); list_move(&entry->lru, &entry->pool->lru); spin_unlock(&entry->pool->lru_lock); -- cgit v1.2.3 From 1e3be4856f49d55c60b6cd500297b06acfe216a9 Mon Sep 17 00:00:00 2001 From: Tarun Sahu Date: Mon, 12 Jun 2023 15:05:14 +0530 Subject: mm/folio: replace set_compound_order with folio_set_order The patch ("mm/folio: Avoid special handling for order value 0 in folio_set_order") [1] removed the need for special handling of order = 0 in folio_set_order. Now, folio_set_order and set_compound_order become similar functions. This patch removes set_compound_order and uses folio_set_order instead. 
[1] https://lore.kernel.org/all/20230609183032.13E08C433D2@smtp.kernel.org/ Link: https://lkml.kernel.org/r/20230612093514.689846-1-tsahu@linux.ibm.com Signed-off-by: Tarun Sahu Reviewed-by: Sidhartha Kumar Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Gerald Schaefer Cc: Matthew Wilcox Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ---------- mm/internal.h | 32 ++++++++++++++++---------------- 2 files changed, 16 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f40bf17d597..ab04756b2240 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1232,16 +1232,6 @@ static inline void folio_set_compound_dtor(struct folio *folio, void destroy_large_folio(struct folio *folio); -static inline void set_compound_order(struct page *page, unsigned int order) -{ - struct folio *folio = (struct folio *)page; - - folio->_folio_order = order; -#ifdef CONFIG_64BIT - folio->_folio_nr_pages = 1U << order; -#endif -} - /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { diff --git a/mm/internal.h b/mm/internal.h index 33b8b8f66af3..b0d8778dd910 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -387,12 +387,27 @@ extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); +/* + * This will have no effect, other than possibly generating a warning, if the + * caller passes in a non-large folio. 
+ */ +static inline void folio_set_order(struct folio *folio, unsigned int order) +{ + if (WARN_ON_ONCE(!order || !folio_test_large(folio))) + return; + + folio->_folio_order = order; +#ifdef CONFIG_64BIT + folio->_folio_nr_pages = 1U << order; +#endif +} + static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); - set_compound_order(page, order); + folio_set_order(folio, order); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); @@ -432,21 +447,6 @@ void memmap_init_range(unsigned long, int, unsigned long, unsigned long, int split_free_page(struct page *free_page, unsigned int order, unsigned long split_pfn_offset); -/* - * This will have no effect, other than possibly generating a warning, if the - * caller passes in a non-large folio. - */ -static inline void folio_set_order(struct folio *folio, unsigned int order) -{ - if (WARN_ON_ONCE(!order || !folio_test_large(folio))) - return; - - folio->_folio_order = order; -#ifdef CONFIG_64BIT - folio->_folio_nr_pages = 1U << order; -#endif -} - #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* -- cgit v1.2.3 From 65ac132027a884c411b8f9f96d240ba2dde34dec Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 31 May 2023 21:54:02 -0400 Subject: userfaultfd: fix regression in userfaultfd_unmap_prep() Android reported a performance regression in the userfaultfd unmap path. A closer inspection on the userfaultfd_unmap_prep() change showed that a second tree walk would be necessary in the reworked code. Fix the regression by passing each VMA that will be unmapped through to the userfaultfd_unmap_prep() function as they are added to the unmap list, instead of re-walking the tree for the VMA. 
Link: https://lkml.kernel.org/r/20230601015402.2819343-1-Liam.Howlett@oracle.com Fixes: 69dbe6daf104 ("userfaultfd: use maple tree iterator to iterate VMAs") Signed-off-by: Liam R. Howlett Reported-by: Suren Baghdasaryan Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 35 +++++++++++++++-------------------- include/linux/userfaultfd_k.h | 6 +++--- mm/mmap.c | 31 +++++++++++++++---------------- 3 files changed, 33 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 478e2b169c13..0aa5caac5164 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -852,31 +852,26 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, return false; } -int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, +int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *unmaps) { - VMA_ITERATOR(vmi, mm, start); - struct vm_area_struct *vma; - - for_each_vma_range(vmi, vma, end) { - struct userfaultfd_unmap_ctx *unmap_ctx; - struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + struct userfaultfd_unmap_ctx *unmap_ctx; + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || - has_unmap_ctx(ctx, unmaps, start, end)) - continue; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || + has_unmap_ctx(ctx, unmaps, start, end)) + return 0; - unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); - if (!unmap_ctx) - return -ENOMEM; + unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); + if (!unmap_ctx) + return -ENOMEM; - userfaultfd_ctx_get(ctx); - atomic_inc(&ctx->mmap_changing); - unmap_ctx->ctx = ctx; - unmap_ctx->start = start; - unmap_ctx->end = end; - list_add_tail(&unmap_ctx->list, unmaps); - } + userfaultfd_ctx_get(ctx); + atomic_inc(&ctx->mmap_changing); + unmap_ctx->ctx = ctx; + unmap_ctx->start = start; + 
unmap_ctx->end = end; + list_add_tail(&unmap_ctx->list, unmaps); return 0; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index d78b01524349..ac7b0c96d351 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -188,8 +188,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, - unsigned long end, struct list_head *uf); +extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); @@ -271,7 +271,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma, return true; } -static inline int userfaultfd_unmap_prep(struct mm_struct *mm, +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf) { diff --git a/mm/mmap.c b/mm/mmap.c index f084b7940431..4fc496bc5b95 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2417,6 +2417,21 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, goto munmap_sidetree_failed; count++; + if (unlikely(uf)) { + /* + * If userfaultfd_unmap_prep returns an error the vmas + * will remain split, but userland will get a + * highly unexpected error anyway. This is no + * different than the case where the first of the two + * __split_vma fails, but we don't undo the first + * split, despite we could. This is unlikely enough + * failure that it's not worth optimizing it for. 
+ */ + error = userfaultfd_unmap_prep(next, start, end, uf); + + if (error) + goto userfaultfd_error; + } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE BUG_ON(next->vm_start < start); BUG_ON(next->vm_start > end); @@ -2429,22 +2444,6 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!next) next = vma_next(vmi); - if (unlikely(uf)) { - /* - * If userfaultfd_unmap_prep returns an error the vmas - * will remain split, but userland will get a - * highly unexpected error anyway. This is no - * different than the case where the first of the two - * __split_vma fails, but we don't undo the first - * split, despite we could. This is unlikely enough - * failure that it's not worth optimizing it for. - */ - error = userfaultfd_unmap_prep(mm, start, end, uf); - - if (error) - goto userfaultfd_error; - } - #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { -- cgit v1.2.3 From e4d86756159b5794edad5b0d0d19c6f3d9888240 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 10 Jun 2023 18:19:56 +0800 Subject: mm: remove unused vma_init_lock() commit c7f8f31c00d1 ("mm: separate vma->lock from vm_area_struct") left this behind. 
Link: https://lkml.kernel.org/r/20230610101956.20592-1-yuehaibing@huawei.com Signed-off-by: YueHaibing Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index ab04756b2240..f20ac57b634d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -725,7 +725,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ -static inline void vma_init_lock(struct vm_area_struct *vma) {} static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} -- cgit v1.2.3 From 833dfc0090b3f8017ddac82d818b2d8e5ceb61db Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 10 Jun 2023 11:46:15 +0800 Subject: mm: compaction: mark kcompactd_run() and kcompactd_stop() __meminit Add __meminit to kcompactd_run() and kcompactd_stop() so that they default to __init when memory hotplug is not enabled. 
Link: https://lkml.kernel.org/r/20230610034615.997813-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Baolin Wang Signed-off-by: Andrew Morton --- include/linux/compaction.h | 4 ++-- mm/compaction.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 57b16e69c19a..e94776496049 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -98,8 +98,8 @@ extern void compaction_defer_reset(struct zone *zone, int order, bool compaction_zonelist_suitable(struct alloc_context *ac, int order, int alloc_flags); -extern void kcompactd_run(int nid); -extern void kcompactd_stop(int nid); +extern void __meminit kcompactd_run(int nid); +extern void __meminit kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); #else diff --git a/mm/compaction.c b/mm/compaction.c index 767b0815c874..6149a2d324be 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3050,7 +3050,7 @@ static int kcompactd(void *p) * This kcompactd start function will be called by init and node-hot-add. * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. */ -void kcompactd_run(int nid) +void __meminit kcompactd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -3068,7 +3068,7 @@ void kcompactd_run(int nid) * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done(). */ -void kcompactd_stop(int nid) +void __meminit kcompactd_stop(int nid) { struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; -- cgit v1.2.3 From 53418a18fcbbb086dbfacbdd9b853c1071d3ec16 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 12 Jun 2023 22:01:31 +0100 Subject: buffer: convert __block_write_full_page() to __block_write_full_folio() Remove nine hidden calls to compound_head() by using a folio instead of a page. 
Link: https://lkml.kernel.org/r/20230612210141.730128-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Bob Peterson Reviewed-by: Bob Peterson Cc: Andreas Gruenbacher Cc: Hannes Reinecke Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- fs/buffer.c | 53 +++++++++++++++++++++++---------------------- fs/gfs2/aops.c | 5 ++--- fs/ntfs/aops.c | 2 +- fs/reiserfs/inode.c | 2 +- include/linux/buffer_head.h | 2 +- 5 files changed, 32 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index a7fc561758b1..4d518df50fab 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1764,7 +1764,7 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ -int __block_write_full_page(struct inode *inode, struct page *page, +int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler) { @@ -1776,14 +1776,14 @@ int __block_write_full_page(struct inode *inode, struct page *page, int nr_underway = 0; blk_opf_t write_flags = wbc_to_write_flags(wbc); - head = folio_create_buffers(page_folio(page), inode, + head = folio_create_buffers(folio, inode, (1 << BH_Dirty) | (1 << BH_Uptodate)); /* * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the page stays dirty. + * then we just miss that fact, and the folio stays dirty. * * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. 
@@ -1793,7 +1793,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, blocksize = bh->b_size; bbits = block_size_bits(blocksize); - block = (sector_t)page->index << (PAGE_SHIFT - bbits); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); last_block = (i_size_read(inode) - 1) >> bbits; /* @@ -1804,7 +1804,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (block > last_block) { /* * mapped buffers outside i_size will occur, because - * this page can be outside i_size when there is a + * this folio can be outside i_size when there is a * truncate in progress. */ /* @@ -1834,7 +1834,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, continue; /* * If it's a fully non-blocking write attempt and we cannot - * lock the buffer then redirty the page. Note that this can + * lock the buffer then redirty the folio. Note that this can * potentially cause a busy-wait loop from writeback threads * and kswapd activity, but those code paths have their own * higher-level throttling. @@ -1842,7 +1842,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); continue; } if (test_clear_buffer_dirty(bh)) { @@ -1853,11 +1853,11 @@ int __block_write_full_page(struct inode *inode, struct page *page, } while ((bh = bh->b_this_page) != head); /* - * The page and its buffers are protected by PageWriteback(), so we can - * drop the bh refcounts early. + * The folio and its buffers are protected by the writeback flag, + * so we can drop the bh refcounts early. 
*/ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); do { struct buffer_head *next = bh->b_this_page; @@ -1867,20 +1867,20 @@ int __block_write_full_page(struct inode *inode, struct page *page, } bh = next; } while (bh != head); - unlock_page(page); + folio_unlock(folio); err = 0; done: if (nr_underway == 0) { /* - * The page was marked dirty, but the buffers were + * The folio was marked dirty, but the buffers were * clean. Someone wrote them back by hand with * write_dirty_buffer/submit_bh. A rare case. */ - end_page_writeback(page); + folio_end_writeback(folio); /* - * The page and buffer_heads can be released at any time from + * The folio and buffer_heads can be released at any time from * here on. */ } @@ -1891,7 +1891,7 @@ recover: * ENOSPC, or some other error. We may already have added some * blocks to the file, so we need to write these out to avoid * exposing stale data. - * The page is currently locked and not marked for writeback + * The folio is currently locked and not marked for writeback */ bh = head; /* Recovery: lock and submit the mapped buffers */ @@ -1903,15 +1903,15 @@ recover: } else { /* * The buffer may have been set dirty during - * attachment to a dirty page. + * attachment to a dirty folio. 
*/ clear_buffer_dirty(bh); } } while ((bh = bh->b_this_page) != head); - SetPageError(page); - BUG_ON(PageWriteback(page)); - mapping_set_error(page->mapping, err); - set_page_writeback(page); + folio_set_error(folio); + BUG_ON(folio_test_writeback(folio)); + mapping_set_error(folio->mapping, err); + folio_start_writeback(folio); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { @@ -1921,10 +1921,10 @@ recover: } bh = next; } while (bh != head); - unlock_page(page); + folio_unlock(folio); goto done; } -EXPORT_SYMBOL(__block_write_full_page); +EXPORT_SYMBOL(__block_write_full_folio); /* * If a page has any new buffers, zero them out here, and mark them uptodate @@ -2677,6 +2677,7 @@ EXPORT_SYMBOL(block_truncate_page); int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); const pgoff_t end_index = i_size >> PAGE_SHIFT; @@ -2684,13 +2685,13 @@ int block_write_full_page(struct page *page, get_block_t *get_block, /* Is the page fully inside i_size? */ if (page->index < end_index) - return __block_write_full_page(inode, page, get_block, wbc, + return __block_write_full_folio(inode, folio, get_block, wbc, end_buffer_async_write); /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_SIZE-1); if (page->index >= end_index+1 || !offset) { - unlock_page(page); + folio_unlock(folio); return 0; /* don't care */ } @@ -2702,7 +2703,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block, * writes to that region are not written out to the file." 
*/ zero_user_segment(page, offset, PAGE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc, + return __block_write_full_folio(inode, folio, get_block, wbc, end_buffer_async_write); } EXPORT_SYMBOL(block_write_full_page); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ec5b5c1ea634..3a2be1901e1e 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -107,9 +107,8 @@ static int gfs2_write_jdata_folio(struct folio *folio, folio_zero_segment(folio, offset_in_folio(folio, i_size), folio_size(folio)); - return __block_write_full_page(inode, &folio->page, - gfs2_get_block_noalloc, wbc, - end_buffer_async_write); + return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc, + wbc, end_buffer_async_write); } /** diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index e8aeba124a95..4e158bce4192 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -526,7 +526,7 @@ err_out: * * Return 0 on success and -errno on error. * - * Based on ntfs_read_block() and __block_write_full_page(). + * Based on ntfs_read_block() and __block_write_full_folio(). */ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d8debbb6105f..ff34ee49106f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2506,7 +2506,7 @@ out: /* * mason@suse.com: updated in 2.5.54 to follow the same general io - * start/recovery path as __block_write_full_page, along with special + * start/recovery path as __block_write_full_folio, along with special * code to handle reiserfs tails. 
*/ static int reiserfs_write_full_page(struct page *page, diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 1520793c72da..a366e01f8bd4 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -263,7 +263,7 @@ extern int buffer_heads_over_limit; void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc); -int __block_write_full_page(struct inode *inode, struct page *page, +int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler); int block_read_full_folio(struct folio *, get_block_t *); -- cgit v1.2.3 From 4a9622f2fdaee84c373f3f285d898a3ea60ee9f2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 12 Jun 2023 22:01:36 +0100 Subject: buffer: convert page_zero_new_buffers() to folio_zero_new_buffers() Most of the callers already have a folio; convert reiserfs_write_end() to have a folio. Removes a couple of hidden calls to compound_head(). 
Link: https://lkml.kernel.org/r/20230612210141.730128-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Bob Peterson Cc: Hannes Reinecke Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- fs/buffer.c | 27 ++++++++++++++------------- fs/ext4/inode.c | 4 ++-- fs/reiserfs/inode.c | 7 ++++--- include/linux/buffer_head.h | 2 +- 4 files changed, 21 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index 97c64b05151f..e4bd465ecee8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1927,33 +1927,34 @@ recover: EXPORT_SYMBOL(__block_write_full_folio); /* - * If a page has any new buffers, zero them out here, and mark them uptodate + * If a folio has any new buffers, zero them out here, and mark them uptodate * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. */ -void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) +void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) { - unsigned int block_start, block_end; + size_t block_start, block_end; struct buffer_head *head, *bh; - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) + BUG_ON(!folio_test_locked(folio)); + head = folio_buffers(folio); + if (!head) return; - bh = head = page_buffers(page); + bh = head; block_start = 0; do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { - unsigned start, size; + if (!folio_test_uptodate(folio)) { + size_t start, xend; start = max(from, block_start); - size = min(to, block_end) - start; + xend = min(to, block_end); - zero_user(page, start, size); + folio_zero_segment(folio, start, xend); set_buffer_uptodate(bh); } @@ -1966,7 +1967,7 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) bh = bh->b_this_page; } while (bh != head); } -EXPORT_SYMBOL(page_zero_new_buffers); 
+EXPORT_SYMBOL(folio_zero_new_buffers); static void iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, @@ -2104,7 +2105,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) - page_zero_new_buffers(&folio->page, from, to); + folio_zero_new_buffers(folio, from, to); return err; } @@ -2208,7 +2209,7 @@ int block_write_end(struct file *file, struct address_space *mapping, if (!folio_test_uptodate(folio)) copied = 0; - page_zero_new_buffers(&folio->page, start+copied, start+len); + folio_zero_new_buffers(folio, start+copied, start+len); } flush_dcache_folio(folio); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ce5f21b6c2b3..31b839a0ce8b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1093,7 +1093,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - page_zero_new_buffers(&folio->page, from, to); + folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; @@ -1339,7 +1339,7 @@ static int ext4_write_end(struct file *file, } /* - * This is a private version of page_zero_new_buffers() which doesn't + * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. 
*/ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ff34ee49106f..77bd3b27059f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2872,6 +2872,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; int ret = 0; int update_sd = 0; @@ -2887,12 +2888,12 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, start = pos & (PAGE_SIZE - 1); if (unlikely(copied < len)) { - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) copied = 0; - page_zero_new_buffers(page, start + copied, start + len); + folio_zero_new_buffers(folio, start + copied, start + len); } - flush_dcache_page(page); + flush_dcache_folio(folio); reiserfs_commit_page(inode, page, start, start + copied); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index a366e01f8bd4..c794ea7096ba 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -278,7 +278,7 @@ int block_write_end(struct file *, struct address_space *, int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); -void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); +void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); void clean_page_buffers(struct page *page); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **, -- cgit v1.2.3 From 6c77b607ee26472fb945aa41734281c39d06d68f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 14 Jun 2023 22:36:12 +0800 Subject: mm: kill lock|unlock_page_memcg() Since commit c7c3dec1c9db ("mm: rmap: remove lock_page_memcg()"), there are no more users; kill lock_page_memcg() and unlock_page_memcg().
Link: https://lkml.kernel.org/r/20230614143612.62575-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 2 +- include/linux/memcontrol.h | 12 +----------- mm/filemap.c | 2 +- mm/memcontrol.c | 18 ++++-------------- mm/page-writeback.c | 6 +++--- 5 files changed, 10 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 47d1d7d932a8..fabaad3fd9c2 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -297,7 +297,7 @@ Lock order is as follows:: Page lock (PG_locked bit of page->flags) mm->page_table_lock or split pte_lock - lock_page_memcg (memcg->move_lock) + folio_memcg_lock (memcg->move_lock) mapping->i_pages lock lruvec->lru_lock. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 00a88cf947e1..c3d3a0c09315 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -419,7 +419,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * * - the folio lock * - LRU isolation - * - lock_page_memcg() + * - folio_memcg_lock() * - exclusive reference * - mem_cgroup_trylock_pages() * @@ -949,8 +949,6 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); void folio_memcg_lock(struct folio *folio); void folio_memcg_unlock(struct folio *folio); -void lock_page_memcg(struct page *page); -void unlock_page_memcg(struct page *page); void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); @@ -1438,14 +1436,6 @@ mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } -static inline void lock_page_memcg(struct page *page) -{ -} - -static inline void unlock_page_memcg(struct page *page) -{ -} - static inline void folio_memcg_lock(struct folio *folio) { } diff --git a/mm/filemap.c 
b/mm/filemap.c index 00933089b8b6..758bbdf300e7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -117,7 +117,7 @@ * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) + * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 93056918e956..cf06b1c9b3bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2148,17 +2148,12 @@ again: * When charge migration first begins, we can have multiple * critical sections holding the fast-path RCU lock and one * holding the slowpath move_lock. Track the task who has the - * move_lock for unlock_page_memcg(). + * move_lock for folio_memcg_unlock(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; } -void lock_page_memcg(struct page *page) -{ - folio_memcg_lock(page_folio(page)); -} - static void __folio_memcg_unlock(struct mem_cgroup *memcg) { if (memcg && memcg->move_lock_task == current) { @@ -2186,11 +2181,6 @@ void folio_memcg_unlock(struct folio *folio) __folio_memcg_unlock(folio_memcg(folio)); } -void unlock_page_memcg(struct page *page) -{ - folio_memcg_unlock(page_folio(page)); -} - struct memcg_stock_pcp { local_lock_t stock_lock; struct mem_cgroup *cached; /* this never be root cgroup */ @@ -2866,7 +2856,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) * * - the page lock * - LRU isolation - * - lock_page_memcg() + * - folio_memcg_lock() * - exclusive reference * - mem_cgroup_trylock_pages() */ @@ -5829,7 +5819,7 @@ static int mem_cgroup_move_account(struct page *page, * with (un)charging, migration, LRU putback, or anything else * that would rely on a stable page's memory cgroup. 
* - * Note that lock_page_memcg is a memcg lock, not a page lock, + * Note that folio_memcg_lock is a memcg lock, not a page lock, * to save space. As soon as we switch page's memory cgroup to a * new memcg that isn't locked, the above state can change * concurrently again. Make sure we're truly done with it. @@ -6320,7 +6310,7 @@ static void mem_cgroup_move_charge(void) { lru_add_drain_all(); /* - * Signal lock_page_memcg() to take the memcg's move_lock + * Signal folio_memcg_lock() to take the memcg's move_lock * while we're moving its pages to another memcg. Then wait * for already started RCU-only updates to finish. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index db7943999007..1d17fb1ec863 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2597,7 +2597,7 @@ EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * - * Caller must hold lock_page_memcg(). + * Caller must hold folio_memcg_lock(). * * NOTE: This relies on being atomic wrt interrupts. */ @@ -2631,7 +2631,7 @@ static void folio_account_dirtied(struct folio *folio, /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold lock_page_memcg(). + * Caller must hold folio_memcg_lock(). */ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { @@ -2650,7 +2650,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold lock_page_memcg(). Most callers have the folio + * The caller must hold folio_memcg_lock(). Most callers have the folio * locked. A few have the folio blocked from truncation through other * means (eg zap_vma_pages() has it mapped and is holding the page table * lock). 
This can also be called from mark_buffer_dirty(), which I -- cgit v1.2.3 From 708ff4914dfb410761227a219c17c3e9dbd68c05 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 13 Jun 2023 19:13:08 -0700 Subject: mmzone: introduce folio_is_zone_movable() Patch series "Replace is_longterm_pinnable_page()", v2. This patchset introduces some more helper functions for the folio conversions, and converts all callers of is_longterm_pinnable_page() to use folios. This patch (of 5): Introduce folio_is_zone_movable() to act as a folio equivalent for is_zone_movable_page(). This is to assist in later folio conversions. Link: https://lkml.kernel.org/r/20230614021312.34085-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230614021312.34085-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5a7ada0413da..f10902491ead 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1116,6 +1116,11 @@ static inline bool is_zone_movable_page(const struct page *page) { return page_zonenum(page) == ZONE_MOVABLE; } + +static inline bool folio_is_zone_movable(const struct folio *folio) +{ + return folio_zonenum(folio) == ZONE_MOVABLE; +} #endif /* -- cgit v1.2.3 From 28fb54f6a2fd6cc471165cce1650a57dfbf49746 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 13 Jun 2023 19:13:09 -0700 Subject: mmzone: introduce folio_migratetype() Introduce folio_migratetype() as a folio equivalent for get_pageblock_migratetype(). This function intends to return the migratetype the folio is located in, hence the name choice. 
Link: https://lkml.kernel.org/r/20230614021312.34085-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f10902491ead..3e822335f214 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -105,6 +105,9 @@ extern int page_group_by_mobility_disabled; #define get_pageblock_migratetype(page) \ get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) +#define folio_migratetype(folio) \ + get_pfnblock_flags_mask(&folio->page, folio_pfn(folio), \ + MIGRATETYPE_MASK) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; -- cgit v1.2.3 From 5d949953f841fd661a2a49df188426d5930ed723 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 13 Jun 2023 19:13:12 -0700 Subject: mm: remove is_longterm_pinnable_page() and reimplement folio_is_longterm_pinnable() folio_is_longterm_pinnable() already exists as a wrapper function. Now that the whole implementation of is_longterm_pinnable_page() can be implemented using folios, folio_is_longterm_pinnable() can be made its own standalone function - and we can remove is_longterm_pinnable_page(). 
Link: https://lkml.kernel.org/r/20230614021312.34085-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f20ac57b634d..a8baa34d0747 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1902,39 +1902,35 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, return page_maybe_dma_pinned(page); } -/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ +/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */ #ifdef CONFIG_MIGRATION -static inline bool is_longterm_pinnable_page(struct page *page) +static inline bool folio_is_longterm_pinnable(struct folio *folio) { #ifdef CONFIG_CMA - int mt = get_pageblock_migratetype(page); + int mt = folio_migratetype(folio); if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif /* The zero page may always be pinned */ - if (is_zero_pfn(page_to_pfn(page))) + if (is_zero_pfn(folio_pfn(folio))) return true; /* Coherent device memory must always allow eviction. */ - if (is_device_coherent_page(page)) + if (folio_is_device_coherent(folio)) return false; - /* Otherwise, non-movable zone pages can be pinned. */ - return !is_zone_movable_page(page); + /* Otherwise, non-movable zone folios can be pinned. 
*/ + return !folio_is_zone_movable(folio); + } #else -static inline bool is_longterm_pinnable_page(struct page *page) +static inline bool folio_is_longterm_pinnable(struct folio *folio) { return true; } #endif -static inline bool folio_is_longterm_pinnable(struct folio *folio) -{ - return is_longterm_pinnable_page(&folio->page); -} - static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); -- cgit v1.2.3 From 025b7799b35d32e46988ba0614ea2f91b85d6375 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 16 Jun 2023 14:30:30 +0800 Subject: mm/memcg: remove return value of mem_cgroup_scan_tasks() No user checks the return value of mem_cgroup_scan_tasks(). Make the return value void. Link: https://lkml.kernel.org/r/20230616063030.977586-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Johannes Weiner Cc: Kefeng Wang Cc: Michal Hocko Cc: Muchun Song Cc: Nanyong Sun Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 +++---- mm/memcontrol.c | 9 ++++----- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c3d3a0c09315..5818af8eca5a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -820,8 +820,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); -int mem_cgroup_scan_tasks(struct mem_cgroup *, - int (*)(struct task_struct *, void *), void *); +void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*)(struct task_struct *, void *), void *arg); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { @@ -1364,10 +1364,9 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root, { } -static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, +static inline void 
mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) { - return 0; } static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cf06b1c9b3bb..a834b1edcde9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1259,13 +1259,13 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * * This function iterates over tasks attached to @memcg or to any of its * descendants and calls @fn for each task. If @fn returns a non-zero - * value, the function breaks the iteration loop and returns the value. - * Otherwise, it will iterate over all tasks and return 0. + * value, the function breaks the iteration loop. Otherwise, it will iterate + * over all tasks and return 0. * * This function must not be called for the root memory cgroup. */ -int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, - int (*fn)(struct task_struct *, void *), void *arg) +void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*fn)(struct task_struct *, void *), void *arg) { struct mem_cgroup *iter; int ret = 0; @@ -1285,7 +1285,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, break; } } - return ret; } #ifdef CONFIG_DEBUG_VM -- cgit v1.2.3 From c1753fd02a0058ea43cbb31ab26d25be2f6cfe08 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 15 May 2023 10:35:36 -0400 Subject: mm: move mm_count into its own cache line The mm_struct mm_count field is frequently updated by mmgrab/mmdrop performed by context switch. This causes false-sharing for surrounding mm_struct fields which are read-mostly. This has been observed on a 2sockets/112core/224cpu Intel Sapphire Rapids server running hackbench, and by the kernel test robot will-it-scale testcase. Move the mm_count field into its own cache line to prevent false-sharing with other mm_struct fields. 
Move mm_count to the first field of mm_struct to minimize the amount of padding required: rather than adding padding before and after the mm_count field, padding is only added after mm_count. Note that I noticed this odd comment in mm_struct: commit 2e3025434a6b ("mm: relocate 'write_protect_seq' in struct mm_struct") /* * With some kernel config, the current mmap_lock's offset * inside 'mm_struct' is at 0x120, which is very optimal, as * its two hot fields 'count' and 'owner' sit in 2 different * cachelines, and when mmap_lock is highly contended, both * of the 2 fields will be accessed frequently, current layout * will help to reduce cache bouncing. * * So please be careful with adding new fields before * mmap_lock, which can easily push the 2 fields into one * cacheline. */ struct rw_semaphore mmap_lock; This comment is rather odd for a few reasons: - It requires addition/removal of mm_struct fields to carefully consider field alignment of _other_ fields, - It expresses the wish to keep an "optimal" alignment for a specific kernel config. I suspect that the author of this comment may want to revisit this topic and perhaps introduce a split-struct approach for struct rw_semaphore, if the need is to place various fields of this structure in different cache lines. 
Link: https://lkml.kernel.org/r/20230515143536.114960-1-mathieu.desnoyers@efficios.com Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID") Link: https://lore.kernel.org/lkml/7a0c1db1-103d-d518-ed96-1584a28fbf32@efficios.com Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202305151017.27581d75-yujie.liu@intel.com Signed-off-by: Mathieu Desnoyers Reviewed-by: Aaron Lu Reviewed-by: John Hubbard Cc: Peter Zijlstra Cc: Olivier Dion Cc: Cc: Feng Tang Cc: Jason Gunthorpe Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 306a3d1a0fa6..de10fc797c8e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -583,6 +583,21 @@ struct mm_cid { struct kioctx_table; struct mm_struct { struct { + /* + * Fields which are often written to are placed in a separate + * cache line. + */ + struct { + /** + * @mm_count: The number of references to &struct + * mm_struct (@mm_users count as 1). + * + * Use mmgrab()/mmdrop() to modify. When this drops to + * 0, the &struct mm_struct is freed. + */ + atomic_t mm_count; + } ____cacheline_aligned_in_smp; + struct maple_tree mm_mt; #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, @@ -620,14 +635,6 @@ struct mm_struct { */ atomic_t mm_users; - /** - * @mm_count: The number of references to &struct mm_struct - * (@mm_users count as 1). - * - * Use mmgrab()/mmdrop() to modify. When this drops to 0, the - * &struct mm_struct is freed. - */ - atomic_t mm_count; #ifdef CONFIG_SCHED_MM_CID /** * @pcpu_cid: Per-cpu current cid. 
-- cgit v1.2.3 From cf01724e2d73a90524450e3dd8798cfb9d7aca05 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 17 Jun 2023 11:46:22 +0800 Subject: mm: page_alloc: make compound_page_dtors static It's only used inside page_alloc.c now. So make it static and remove the declaration in mm.h. Link: https://lkml.kernel.org/r/20230617034622.1235913-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/page_alloc.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a8baa34d0747..cf43deb25553 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1220,7 +1220,6 @@ enum compound_dtor_id { #endif NR_COMPOUND_DTORS, }; -extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void folio_set_compound_dtor(struct folio *folio, enum compound_dtor_id compound_dtor) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6a18f2232e3e..5b8a9d610b72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -284,7 +284,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { +static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { [NULL_COMPOUND_DTOR] = NULL, [COMPOUND_PAGE_DTOR] = free_compound_page, #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From 61167ad5fecdeaa037f3df1ba354dddd5f66a1ed Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Mon, 19 Jun 2023 10:34:06 +0800 Subject: mm: pass nid to reserve_bootmem_region() early_pfn_to_nid() is called frequently in init_reserved_page(); it returns the node id of the PFN. These PFNs are probably from the same memory region, so they have the same node id. It's not necessary to call early_pfn_to_nid() for each PFN. Pass nid to reserve_bootmem_region() and drop the call to early_pfn_to_nid() in init_reserved_page().
Also, set nid on all reserved pages before doing this, as some reserved memory regions may not have their nid set. The function that benefits most is memmap_init_reserved_pages() when CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled. The following data was measured on an x86 machine with 190GB of RAM. before: memmap_init_reserved_pages() 67ms after: memmap_init_reserved_pages() 20ms Link: https://lkml.kernel.org/r/20230619023406.424298-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- mm/memblock.c | 31 +++++++++++++++++++++---------- mm/mm_init.c | 30 +++++++++++++++++------------- 3 files changed, 40 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index cf43deb25553..9ecb8b9c07f6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2940,7 +2940,8 @@ extern unsigned long free_reserved_area(void *start, void *end, extern void adjust_managed_page_count(struct page *page, long count); -extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); +extern void reserve_bootmem_region(phys_addr_t start, + phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed.
*/ static inline void free_reserved_page(struct page *page) diff --git a/mm/memblock.c b/mm/memblock.c index da4264528e1e..46739551d4d1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2082,19 +2082,30 @@ static void __init memmap_init_reserved_pages(void) { struct memblock_region *region; phys_addr_t start, end; - u64 i; + int nid; + + /* + * set nid on all reserved pages and also treat struct + * pages for the NOMAP regions as PageReserved + */ + for_each_mem_region(region) { + nid = memblock_get_region_node(region); + start = region->base; + end = start + region->size; + + if (memblock_is_nomap(region)) + reserve_bootmem_region(start, end, nid); + + memblock_set_node(start, end, &memblock.reserved, nid); + } /* initialize struct pages for the reserved regions */ - for_each_reserved_mem_range(i, &start, &end) - reserve_bootmem_region(start, end); + for_each_reserved_mem_region(region) { + nid = memblock_get_region_node(region); + start = region->base; + end = start + region->size; - /* and also treat struct pages for the NOMAP regions as PageReserved */ - for_each_mem_region(region) { - if (memblock_is_nomap(region)) { - start = region->base; - end = start + region->size; - reserve_bootmem_region(start, end); - } + reserve_bootmem_region(start, end, nid); } } diff --git a/mm/mm_init.c b/mm/mm_init.c index 122e9bf3fa73..7ffa609673ea 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -646,10 +646,8 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) } /* Returns true if the struct page for the pfn is initialised */ -static inline bool __meminit early_page_initialised(unsigned long pfn) +static inline bool __meminit early_page_initialised(unsigned long pfn, int nid) { - int nid = early_pfn_to_nid(pfn); - if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) return false; @@ -695,15 +693,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static void __meminit init_reserved_page(unsigned long pfn) +static void 
__meminit init_reserved_page(unsigned long pfn, int nid) { pg_data_t *pgdat; - int nid, zid; + int zid; - if (early_page_initialised(pfn)) + if (early_page_initialised(pfn, nid)) return; - nid = early_pfn_to_nid(pfn); pgdat = NODE_DATA(nid); for (zid = 0; zid < MAX_NR_ZONES; zid++) { @@ -717,7 +714,7 @@ static void __meminit init_reserved_page(unsigned long pfn) #else static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} -static inline bool early_page_initialised(unsigned long pfn) +static inline bool early_page_initialised(unsigned long pfn, int nid) { return true; } @@ -727,7 +724,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static inline void init_reserved_page(unsigned long pfn) +static inline void init_reserved_page(unsigned long pfn, int nid) { } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ @@ -738,7 +735,8 @@ static inline void init_reserved_page(unsigned long pfn) * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. 
*/ -void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) +void __meminit reserve_bootmem_region(phys_addr_t start, + phys_addr_t end, int nid) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -747,7 +745,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) if (pfn_valid(start_pfn)) { struct page *page = pfn_to_page(start_pfn); - init_reserved_page(start_pfn); + init_reserved_page(start_pfn, nid); /* Avoid false-positive PageTail() */ INIT_LIST_HEAD(&page->lru); @@ -2572,8 +2570,14 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (!early_page_initialised(pfn)) - return; + + if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { + int nid = early_pfn_to_nid(pfn); + + if (!early_page_initialised(pfn, nid)) + return; + } + if (!kmsan_memblock_free_pages(page, order)) { /* KMSAN will take care of these pages. */ return; -- cgit v1.2.3 From 982a7194afc9a58ec55ed174c61869c2722bb918 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Jun 2023 17:45:46 +0100 Subject: mm: add __folio_batch_release() This performs the same role as __pagevec_release(), ie skipping the check for batch length of 0. 
Link: https://lkml.kernel.org/r/20230621164557.3510324-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index f582f7213ea5..42aad53e382e 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -127,9 +127,15 @@ static inline unsigned folio_batch_add(struct folio_batch *fbatch, return fbatch_space(fbatch); } +static inline void __folio_batch_release(struct folio_batch *fbatch) +{ + __pagevec_release((struct pagevec *)fbatch); +} + static inline void folio_batch_release(struct folio_batch *fbatch) { - pagevec_release((struct pagevec *)fbatch); + if (folio_batch_count(fbatch)) + __folio_batch_release(fbatch); } void folio_batch_remove_exceptionals(struct folio_batch *fbatch); -- cgit v1.2.3 From bdadc6d83156016d2b5eed582c1458c881c53a1e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Jun 2023 17:45:47 +0100 Subject: scatterlist: add sg_set_folio() This wrapper for sg_set_page() lets drivers add folios to a scatterlist more easily. We could, perhaps, do better by using a different page in the folio if offset is larger than UINT_MAX, but let's hope we get a better data structure than this before we need to care about such large folios. 
Link: https://lkml.kernel.org/r/20230621164557.3510324-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index ec46d8e8e49d..77df3d7b18a6 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -141,6 +141,30 @@ static inline void sg_set_page(struct scatterlist *sg, struct page *page, sg->length = len; } +/** + * sg_set_folio - Set sg entry to point at given folio + * @sg: SG entry + * @folio: The folio + * @len: Length of data + * @offset: Offset into folio + * + * Description: + * Use this function to set an sg entry pointing at a folio, never assign + * the folio directly. We encode sg table information in the lower bits + * of the folio pointer. See sg_page() for looking up the page belonging + * to an sg entry. + * + **/ +static inline void sg_set_folio(struct scatterlist *sg, struct folio *folio, + size_t len, size_t offset) +{ + WARN_ON_ONCE(len > UINT_MAX); + WARN_ON_ONCE(offset > UINT_MAX); + sg_assign_page(sg, &folio->page); + sg->offset = offset; + sg->length = len; +} + static inline struct page *sg_page(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG -- cgit v1.2.3 From e0b72c14d8dcc9477e580c261041dae86d4906fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Jun 2023 17:45:50 +0100 Subject: mm: remove check_move_unevictable_pages() All callers have now been converted to call check_move_unevictable_folios(). 
Link: https://lkml.kernel.org/r/20230621164557.3510324-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - mm/vmscan.c | 17 ----------------- 2 files changed, 18 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index ce7e82cf787f..456546443f1f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -439,7 +439,6 @@ static inline bool node_reclaim_enabled(void) } void check_move_unevictable_folios(struct folio_batch *fbatch); -void check_move_unevictable_pages(struct pagevec *pvec); extern void __meminit kswapd_run(int nid); extern void __meminit kswapd_stop(int nid); diff --git a/mm/vmscan.c b/mm/vmscan.c index 27f90896f789..049342b6317c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -8075,23 +8075,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) } #endif -void check_move_unevictable_pages(struct pagevec *pvec) -{ - struct folio_batch fbatch; - unsigned i; - - folio_batch_init(&fbatch); - for (i = 0; i < pvec->nr; i++) { - struct page *page = pvec->pages[i]; - - if (PageTransTail(page)) - continue; - folio_batch_add(&fbatch, page_folio(page)); - } - check_move_unevictable_folios(&fbatch); -} -EXPORT_SYMBOL_GPL(check_move_unevictable_pages); - /** * check_move_unevictable_folios - Move evictable folios to appropriate zone * lru list -- cgit v1.2.3 From ce06442812fc584337c5b23a43bd2be7d037041d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Jun 2023 17:45:51 +0100 Subject: pagevec: rename fbatch_count() This should always have been called folio_batch_count(). 
Link: https://lkml.kernel.org/r/20230621164557.3510324-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 42aad53e382e..3a9d29dd28a3 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -105,7 +105,7 @@ static inline unsigned int folio_batch_count(struct folio_batch *fbatch) return fbatch->nr; } -static inline unsigned int fbatch_space(struct folio_batch *fbatch) +static inline unsigned int folio_batch_space(struct folio_batch *fbatch) { return PAGEVEC_SIZE - fbatch->nr; } @@ -124,7 +124,7 @@ static inline unsigned folio_batch_add(struct folio_batch *fbatch, struct folio *folio) { fbatch->folios[fbatch->nr++] = folio; - return fbatch_space(fbatch); + return folio_batch_space(fbatch); } static inline void __folio_batch_release(struct folio_batch *fbatch) -- cgit v1.2.3 From 76fa88429075667fe76d4905f2f471e0ac3d543c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Jun 2023 17:45:53 +0100 Subject: net: convert sunrpc from pagevec to folio_batch Remove the last usage of pagevecs. There is a slight change here; we now free the folio_batch as soon as it fills up instead of freeing the folio_batch when we try to add a page to a full batch. This should have no effect in practice. 
Link: https://lkml.kernel.org/r/20230621164557.3510324-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Chuck Lever Signed-off-by: Andrew Morton --- include/linux/sunrpc/svc.h | 2 +- net/sunrpc/svc.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 762d7231e574..a3a64fb4053c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -223,7 +223,7 @@ struct svc_rqst { struct page * *rq_next_page; /* next reply page to use */ struct page * *rq_page_end; /* one past the last page */ - struct pagevec rq_pvec; + struct folio_batch rq_fbatch; struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ struct bio_vec rq_bvec[RPCSVC_MAXPAGES]; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 79967b6925bd..8b9011bbece7 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -649,7 +649,7 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp) return rqstp; - pagevec_init(&rqstp->rq_pvec); + folio_batch_init(&rqstp->rq_fbatch); __set_bit(RQ_BUSY, &rqstp->rq_flags); rqstp->rq_server = serv; @@ -860,9 +860,9 @@ bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) } if (*rqstp->rq_next_page) { - if (!pagevec_space(&rqstp->rq_pvec)) - __pagevec_release(&rqstp->rq_pvec); - pagevec_add(&rqstp->rq_pvec, *rqstp->rq_next_page); + if (!folio_batch_add(&rqstp->rq_fbatch, + page_folio(*rqstp->rq_next_page))) + __folio_batch_release(&rqstp->rq_fbatch); } get_page(page); @@ -896,7 +896,7 @@ void svc_rqst_release_pages(struct svc_rqst *rqstp) void svc_rqst_free(struct svc_rqst *rqstp) { - pagevec_release(&rqstp->rq_pvec); + folio_batch_release(&rqstp->rq_fbatch); svc_release_buffer(rqstp); if (rqstp->rq_scratch_page) put_page(rqstp->rq_scratch_page); -- cgit v1.2.3 From 1e0877d58b1e22517d8939b22b963c043e6c63fd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Jun 
2023 17:45:54 +0100 Subject: mm: remove struct pagevec All users are now converted to use the folio_batch so we can get rid of this data structure. Link: https://lkml.kernel.org/r/20230621164557.3510324-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 63 ++++--------------------------------------------- mm/swap.c | 18 +++++++------- 2 files changed, 13 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 3a9d29dd28a3..87cc678adc85 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -3,65 +3,18 @@ * include/linux/pagevec.h * * In many places it is efficient to batch an operation up against multiple - * pages. A pagevec is a multipage container which is used for that. + * folios. A folio_batch is a container which is used for that. */ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H -#include <linux/xarray.h> +#include <linux/types.h> -/* 15 pointers + header align the pagevec structure to a power of two */ +/* 15 pointers + header align the folio_batch structure to a power of two */ #define PAGEVEC_SIZE 15 -struct page; struct folio; -struct address_space; - -/* Layout must match folio_batch */ -struct pagevec { - unsigned char nr; - bool percpu_pvec_drained; - struct page *pages[PAGEVEC_SIZE]; -}; - -void __pagevec_release(struct pagevec *pvec); - -static inline void pagevec_init(struct pagevec *pvec) -{ - pvec->nr = 0; - pvec->percpu_pvec_drained = false; -} - -static inline void pagevec_reinit(struct pagevec *pvec) -{ - pvec->nr = 0; -} - -static inline unsigned pagevec_count(struct pagevec *pvec) -{ - return pvec->nr; -} - -static inline unsigned pagevec_space(struct pagevec *pvec) -{ - return PAGEVEC_SIZE - pvec->nr; -} - -/* - * Add a page to a pagevec. Returns the number of slots still available.
- */ -static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page) -{ - pvec->pages[pvec->nr++] = page; - return pagevec_space(pvec); -} - -static inline void pagevec_release(struct pagevec *pvec) -{ - if (pagevec_count(pvec)) - __pagevec_release(pvec); -} /** * struct folio_batch - A collection of folios. @@ -78,11 +31,6 @@ struct folio_batch { struct folio *folios[PAGEVEC_SIZE]; }; -/* Layout must match pagevec */ -static_assert(sizeof(struct pagevec) == sizeof(struct folio_batch)); -static_assert(offsetof(struct pagevec, pages) == - offsetof(struct folio_batch, folios)); - /** * folio_batch_init() - Initialise a batch of folios * @fbatch: The folio batch. @@ -127,10 +75,7 @@ static inline unsigned folio_batch_add(struct folio_batch *fbatch, return folio_batch_space(fbatch); } -static inline void __folio_batch_release(struct folio_batch *fbatch) -{ - __pagevec_release((struct pagevec *)fbatch); -} +void __folio_batch_release(struct folio_batch *pvec); static inline void folio_batch_release(struct folio_batch *fbatch) { diff --git a/mm/swap.c b/mm/swap.c index 423199ee8478..10348c1cf9c5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1044,25 +1044,25 @@ void release_pages(release_pages_arg arg, int nr) EXPORT_SYMBOL(release_pages); /* - * The pages which we're about to release may be in the deferred lru-addition + * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's - * OK from a correctness point of view but is inefficient - those pages may be + * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * - * So __pagevec_release() will drain those queues here. + * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. 
*/ -void __pagevec_release(struct pagevec *pvec) +void __folio_batch_release(struct folio_batch *fbatch) { - if (!pvec->percpu_pvec_drained) { + if (!fbatch->percpu_pvec_drained) { lru_add_drain(); - pvec->percpu_pvec_drained = true; + fbatch->percpu_pvec_drained = true; } - release_pages(pvec->pages, pagevec_count(pvec)); - pagevec_reinit(pvec); + release_pages(fbatch->folios, folio_batch_count(fbatch)); + folio_batch_reinit(fbatch); } -EXPORT_SYMBOL(__pagevec_release); +EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. -- cgit v1.2.3 From 7302338a14f97eb44cd13f34aab0dc6596f1632c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 19 Jun 2023 19:07:18 +0800 Subject: mm: kill [add|del]_page_to_lru_list() Now that no one calls [add|del]_page_to_lru_list(), let's drop these unused page interfaces. Link: https://lkml.kernel.org/r/20230619110718.65679-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Yu Zhao Reviewed-by: Baolin Wang Cc: James Gowans Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 08c2bcefcb2b..21d6c72bcc71 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -323,12 +323,6 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) list_add(&folio->lru, &lruvec->lists[lru]); } -static __always_inline void add_page_to_lru_list(struct page *page, - struct lruvec *lruvec) -{ - lruvec_add_folio(lruvec, page_folio(page)); -} - static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { @@ -357,12 +351,6 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) -folio_nr_pages(folio)); } -static __always_inline void del_page_from_lru_list(struct page *page, - struct lruvec *lruvec) -{ - lruvec_del_folio(lruvec, page_folio(page)); -} - #ifdef
CONFIG_ANON_VMA_NAME /* * mmap_lock should be read-locked when calling anon_vma_name(). Caller should -- cgit v1.2.3 From 1bc545bff45ce9eefc176ccf663074462a209cb6 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 21 Jun 2023 02:31:01 +0000 Subject: mm/vmscan: fix root proactive reclaim unthrottling unbalanced node When memory.reclaim was introduced, it became the first case where cgroup_reclaim() is true for the root cgroup. Johannes concluded [1] that for most cases this is okay, except for one case. Historically, kswapd would throttle reclaim on a node if a lot of pages marked for reclaim are under writeback (aka the node is congested). This was done by setting the LRUVEC_CONGESTED bit in lruvec->flags. The bit would be cleared when the node is balanced. Similarly, cgroup reclaim would set the same bit when an lruvec is congested, and clear it on the way out of reclaim (to throttle local reclaimers). Before the introduction of memory.reclaim, the root memcg was the only target of kswapd reclaim, and non-root memcgs were the only targets of cgroup reclaim, so they would never interfere. Using the same bit for both was fine. After memory.reclaim, it is possible for cgroup reclaim on the root cgroup to clear the bit set by kswapd. This would result in reclaim on the node being unthrottled before the node is balanced. Fix this by introducing separate bits for cgroup-level and node-level congestion. kswapd can unthrottle an lruvec that is marked as congested by cgroup reclaim (as the entire node should no longer be congested), but not vice versa (to prevent premature unthrottling before the entire node is balanced).
[1]https://lore.kernel.org/lkml/20230405200150.GA35884@cmpxchg.org/ Link: https://lkml.kernel.org/r/20230621023101.432780-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reported-by: Johannes Weiner Closes: https://lore.kernel.org/lkml/20230405200150.GA35884@cmpxchg.org/ Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Muchun Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 18 +++++++++++++++--- mm/vmscan.c | 19 ++++++++++++------- 2 files changed, 27 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3e822335f214..d863698a84e0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -293,9 +293,21 @@ static inline bool is_active_lru(enum lru_list lru) #define ANON_AND_FILE 2 enum lruvec_flags { - LRUVEC_CONGESTED, /* lruvec has many dirty pages - * backed by a congested BDI - */ + /* + * An lruvec has many dirty pages backed by a congested BDI: + * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim. + * It can be cleared by cgroup reclaim or kswapd. + * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim. + * It can only be cleared by kswapd. + * + * Essentially, kswapd can unthrottle an lruvec throttled by cgroup + * reclaim, but not vice versa. This only applies to the root cgroup. + * The goal is to prevent cgroup reclaim on the root cgroup (e.g. + * memory.reclaim) to unthrottle an unbalanced node (that was throttled + * by kswapd). + */ + LRUVEC_CGROUP_CONGESTED, + LRUVEC_NODE_CONGESTED, }; #endif /* !__GENERATING_BOUNDS_H */ diff --git a/mm/vmscan.c b/mm/vmscan.c index b7068be8a034..1080209a568b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6578,10 +6578,13 @@ again: * Legacy memcg will stall in page writeback so avoid forcibly * stalling in reclaim_throttle(). 
*/ - if ((current_is_kswapd() || - (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && - sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { + if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) + set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); + + if (current_is_kswapd()) + set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); + } /* * Stall direct reclaim for IO completions if the lruvec is @@ -6591,7 +6594,8 @@ again: */ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && - test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) + (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || + test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) @@ -6848,7 +6852,7 @@ retry: lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); - clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); } } @@ -7237,7 +7241,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat) { struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); - clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); + clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } -- cgit v1.2.3 From acc72d59c7509540c27c49625cb4b5a8db1f1a84 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Jun 2023 22:49:48 -0700 Subject: mm/hugetlb: remove hugetlb_set_page_subpool() All users have been converted to hugetlb_set_folio_subpool() so we can safely remove this function. 
Link: https://lkml.kernel.org/r/20230623054948.280627-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Mike Kravetz Cc: Muchun Song Cc: Tarun Sahu Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index beb7c63d2871..ca3c8e10f24a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -761,12 +761,6 @@ static inline void hugetlb_set_folio_subpool(struct folio *folio, folio->_hugetlb_subpool = subpool; } -static inline void hugetlb_set_page_subpool(struct page *hpage, - struct hugepage_subpool *subpool) -{ - hugetlb_set_folio_subpool(page_folio(hpage), subpool); -} - static inline struct hstate *hstate_file(struct file *f) { return hstate_inode(file_inode(f)); -- cgit v1.2.3