From 23e9f0138963ceef2a252d887534923a0502b2da Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 3 Nov 2023 11:14:50 +0800 Subject: mm/vmstat: move pgdemote_* to per-node stats Demotion will migrate pages across nodes. Previously, only the global demotion statistics were accounted for. Change them to per-node statistics, making it easier to observe where demotion occurs on each node. This will help to identify which nodes are under pressure. This patch also puts pgdemote_* behind CONFIG_NUMA_BALANCING, since demotion is not available for !CONFIG_NUMA_BALANCING. With this patch, here is a sample where node0 and node1 are DRAM and node3 is PMEM: Global stats: $ grep demote /proc/vmstat pgdemote_kswapd 254288 pgdemote_direct 113497 pgdemote_khugepaged 0 Per-node stats: $ grep demote /sys/devices/system/node/node0/vmstat # demotion source pgdemote_kswapd 68454 pgdemote_direct 83431 pgdemote_khugepaged 0 $ grep demote /sys/devices/system/node/node1/vmstat # demotion source pgdemote_kswapd 185834 pgdemote_direct 30066 pgdemote_khugepaged 0 $ grep demote /sys/devices/system/node/node3/vmstat # demotion target pgdemote_kswapd 0 pgdemote_direct 0 pgdemote_khugepaged 0 Link: https://lkml.kernel.org/r/20231103031450.1456523-1-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Acked-by: "Huang, Ying" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ++++ include/linux/vm_event_item.h | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3c25226beeed..14faffa4354f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -206,6 +206,10 @@ enum node_stat_item { #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ PGPROMOTE_CANDIDATE, /* candidate pages to promote */ + /* PGDEMOTE_*: pages demoted */ + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, #endif NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 8abfa1240040..d1b847502f09 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -41,9 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED, - PGDEMOTE_KSWAPD, - PGDEMOTE_DIRECT, - PGDEMOTE_KHUGEPAGED, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, -- cgit v1.2.3 From b2472efe4316b2687c153919c1513a098bd82c17 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:37 +0800 Subject: maple_tree: introduce {mtree,mas}_lock_nested() In some cases, nested locks may be needed, so {mtree,mas}_lock_nested is introduced. For example, when duplicating maple tree, we need to hold the locks of two trees, in which case nested locks are needed. At the same time, add the definition of spin_lock_nested() in tools for testing. Link: https://lkml.kernel.org/r/20231027033845.90608-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 4 ++++ tools/include/linux/spinlock.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index d01e850b570f..f91dbc7fe091 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -256,6 +256,8 @@ struct maple_tree { struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) +#define mtree_lock_nested(mt, subclass) \ + spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* @@ -406,6 +408,8 @@ struct ma_wr_state { }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) +#define mas_lock_nested(mas, subclass) \ + spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 622266b197d0..a6cdf25b6b9d 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -11,6 +11,7 @@ #define spin_lock_init(x) pthread_mutex_init(x, NULL) #define spin_lock(x) pthread_mutex_lock(x) +#define spin_lock_nested(x, subclass) pthread_mutex_lock(x) #define spin_unlock(x) pthread_mutex_unlock(x) #define spin_lock_bh(x) pthread_mutex_lock(x) #define spin_unlock_bh(x) pthread_mutex_unlock(x) -- cgit v1.2.3 From fd32e4e9b7646510ee9010e0d5f8b8857d48a6f7 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:38 +0800 Subject: maple_tree: introduce interfaces __mt_dup() and mtree_dup() Introduce interfaces __mt_dup() and mtree_dup(), which are used to duplicate a maple tree. They duplicate a maple tree in Depth-First Search (DFS) pre-order traversal. It uses memcpy() to copy nodes in the source tree and allocate new child nodes in non-leaf nodes. 
The new node is exactly the same as the source node except for all the addresses stored in it. It will be faster than traversing all elements in the source tree and inserting them one by one into the new tree. The time complexity of these two functions is O(n). The difference between __mt_dup() and mtree_dup() is that mtree_dup() handles locks internally. Analysis of the average time complexity of this algorithm: For simplicity, let's assume that the maximum branching factor of all non-leaf nodes is 16 (in allocation mode, it is 10), and the tree is a full tree. Under the given conditions, if there is a maple tree with n elements, the number of its leaves is n/16. From bottom to top, the number of nodes in each level is 1/16 of the number of nodes in the level below. So the total number of nodes in the entire tree is given by the sum of n/16 + n/16^2 + n/16^3 + ... + 1. This is a geometric series with log_16(n) terms. According to the formula for the sum of a geometric series, the sum of this series can be calculated as (n-1)/15. Each node has only one parent node pointer, which can be considered as an edge. In total, there are (n-1)/15-1 edges. This algorithm consists of two operations: 1. Traversing all nodes in DFS order. 2. For each node, making a copy and performing necessary modifications to create a new node. For the first part, DFS traversal will visit each edge twice. Let T(ascend) represent the cost of taking one step upwards, and T(descend) represent the cost of taking one step downwards. Both of them are constants (although mas_ascend() may not be, as it contains a loop, but here we ignore it and treat it as a constant). So the time spent on the first part can be represented as ((n-1)/15-1) * (T(ascend) + T(descend)). For the second part, each node will be copied, and the cost of copying a node is denoted as T(copy_node). 
For each non-leaf node, it is necessary to reallocate all child nodes, and the cost of this operation is denoted as T(dup_alloc). The behavior behind memory allocation is complex and not specific to the maple tree operation. Here, we assume that the time required for a single allocation is constant. Since the size of a node is fixed, both of these symbols are also constants. We can calculate that the time spent on the second part is ((n-1)/15) * T(copy_node) + ((n-1)/15 - n/16) * T(dup_alloc). Adding both parts together, the total time spent by the algorithm can be represented as: ((n-1)/15) * (T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)) - n/16 * T(dup_alloc) - (T(ascend) + T(descend)) Let C1 = T(ascend) + T(descend) + T(copy_node) + T(dup_alloc) Let C2 = T(dup_alloc) Let C3 = T(ascend) + T(descend) Finally, the expression can be simplified as: ((16 * C1 - 15 * C2) / (15 * 16)) * n - (C1 / 15 + C3). This is a linear function, so the average time complexity is O(n). Link: https://lkml.kernel.org/r/20231027033845.90608-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 3 + lib/maple_tree.c | 274 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 277 insertions(+) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index f91dbc7fe091..a452dd8a1e5c 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -329,6 +329,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); +int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); +int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); + void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index ca7039633844..718a222cc090 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4,6 +4,8 @@ * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett * Matthew Wilcox + * Copyright (c) 2023 ByteDance + * Author: Peng Zhang */ /* @@ -6475,6 +6477,278 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index) } EXPORT_SYMBOL(mtree_erase); +/* + * mas_dup_free() - Free an incomplete duplication of a tree. + * @mas: The maple state of a incomplete tree. + * + * The parameter @mas->node passed in indicates that the allocation failed on + * this node. This function frees all nodes starting from @mas->node in the + * reverse order of mas_dup_build(). There is no need to hold the source tree + * lock at this time. + */ +static void mas_dup_free(struct ma_state *mas) +{ + struct maple_node *node; + enum maple_type type; + void __rcu **slots; + unsigned char count, i; + + /* Maybe the first node allocation failed. 
*/ + if (mas_is_none(mas)) + return; + + while (!mte_is_root(mas->node)) { + mas_ascend(mas); + if (mas->offset) { + mas->offset--; + do { + mas_descend(mas); + mas->offset = mas_data_end(mas); + } while (!mte_is_leaf(mas->node)); + + mas_ascend(mas); + } + + node = mte_to_node(mas->node); + type = mte_node_type(mas->node); + slots = ma_slots(node, type); + count = mas_data_end(mas) + 1; + for (i = 0; i < count; i++) + ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; + mt_free_bulk(count, slots); + } + + node = mte_to_node(mas->node); + mt_free_one(node); +} + +/* + * mas_copy_node() - Copy a maple node and replace the parent. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @parent: The parent of the new node. + * + * Copy @mas->node to @new_mas->node, set @parent to be the parent of + * @new_mas->node. No memory is allocated here, so this cannot fail. + */ +static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, + struct maple_pnode *parent) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + unsigned long val; + + /* Copy the node completely. */ + memcpy(new_node, node, sizeof(struct maple_node)); + /* Update the parent node pointer. */ + val = (unsigned long)node->parent & MAPLE_NODE_MASK; + new_node->parent = ma_parent_ptr(val | (unsigned long)parent); +} + +/* + * mas_dup_alloc() - Allocate child nodes for a maple node. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function allocates child nodes for @new_mas->node during the duplication + * process. If memory allocation fails, @mas is set to -ENOMEM. 
+ */ +static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + enum maple_type type; + unsigned char request, count, i; + void __rcu **slots; + void __rcu **new_slots; + unsigned long val; + + /* Allocate memory for child nodes. */ + type = mte_node_type(mas->node); + new_slots = ma_slots(new_node, type); + request = mas_data_end(mas) + 1; + count = mt_alloc_bulk(gfp, request, (void **)new_slots); + if (unlikely(count < request)) { + memset(new_slots, 0, request * sizeof(void *)); + mas_set_err(mas, -ENOMEM); + return; + } + + /* Restore node type information in slots. */ + slots = ma_slots(node, type); + for (i = 0; i < count; i++) { + val = (unsigned long)mt_slot_locked(mas->tree, slots, i); + val &= MAPLE_NODE_MASK; + ((unsigned long *)new_slots)[i] |= val; + } +} + +/* + * mas_dup_build() - Build a new maple tree from a source tree + * @mas: The maple state of source tree, need to be in MAS_START state. + * @new_mas: The maple state of new tree, need to be in MAS_START state. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function builds a new tree in DFS preorder. If the memory allocation + * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the + * last node. mas_dup_free() will free the incomplete duplication of a tree. + * + * Note that the attributes of the two trees need to be exactly the same, and the + * new tree needs to be empty, otherwise -EINVAL will be set in @mas. 
+ */ +static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node; + struct maple_pnode *parent = NULL; + struct maple_enode *root; + enum maple_type type; + + if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || + unlikely(!mtree_empty(new_mas->tree))) { + mas_set_err(mas, -EINVAL); + return; + } + + root = mas_start(mas); + if (mas_is_ptr(mas) || mas_is_none(mas)) + goto set_new_tree; + + node = mt_alloc_one(gfp); + if (!node) { + new_mas->node = MAS_NONE; + mas_set_err(mas, -ENOMEM); + return; + } + + type = mte_node_type(mas->node); + root = mt_mk_node(node, type); + new_mas->node = root; + new_mas->min = 0; + new_mas->max = ULONG_MAX; + root = mte_mk_root(root); + while (1) { + mas_copy_node(mas, new_mas, parent); + if (!mte_is_leaf(mas->node)) { + /* Only allocate child nodes for non-leaf nodes. */ + mas_dup_alloc(mas, new_mas, gfp); + if (unlikely(mas_is_err(mas))) + return; + } else { + /* + * This is the last leaf node and duplication is + * completed. + */ + if (mas->max == ULONG_MAX) + goto done; + + /* This is not the last leaf node and needs to go up. */ + do { + mas_ascend(mas); + mas_ascend(new_mas); + } while (mas->offset == mas_data_end(mas)); + + /* Move to the next subtree. */ + mas->offset++; + new_mas->offset++; + } + + mas_descend(mas); + parent = ma_parent_ptr(mte_to_node(new_mas->node)); + mas_descend(new_mas); + mas->offset = 0; + new_mas->offset = 0; + } +done: + /* Specially handle the parent of the root node. 
*/ + mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); +set_new_tree: + /* Make them the same height */ + new_mas->tree->ma_flags = mas->tree->ma_flags; + rcu_assign_pointer(new_mas->tree->ma_root, root); +} + +/** + * __mt_dup(): Duplicate an entire maple tree + * @mt: The source maple tree + * @new: The new maple tree + * @gfp: The GFP_FLAGS to use for allocations + * + * This function duplicates a maple tree in Depth-First Search (DFS) pre-order + * traversal. It uses memcpy() to copy nodes in the source tree and allocate + * new child nodes in non-leaf nodes. The new node is exactly the same as the + * source node except for all the addresses stored in it. It will be faster than + * traversing all elements in the source tree and inserting them one by one into + * the new tree. + * The user needs to ensure that the attributes of the source tree and the new + * tree are the same, and the new tree needs to be an empty tree, otherwise + * -EINVAL will be returned. + * Note that the user needs to manually lock the source tree and the new tree. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If + * the attributes of the two trees are different or the new tree is not an empty + * tree. + */ +int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) +{ + int ret = 0; + MA_STATE(mas, mt, 0, 0); + MA_STATE(new_mas, new, 0, 0); + + mas_dup_build(&mas, &new_mas, gfp); + if (unlikely(mas_is_err(&mas))) { + ret = xa_err(mas.node); + if (ret == -ENOMEM) + mas_dup_free(&new_mas); + } + + return ret; +} +EXPORT_SYMBOL(__mt_dup); + +/** + * mtree_dup(): Duplicate an entire maple tree + * @mt: The source maple tree + * @new: The new maple tree + * @gfp: The GFP_FLAGS to use for allocations + * + * This function duplicates a maple tree in Depth-First Search (DFS) pre-order + * traversal. It uses memcpy() to copy nodes in the source tree and allocate + * new child nodes in non-leaf nodes. 
The new node is exactly the same as the + * source node except for all the addresses stored in it. It will be faster than + * traversing all elements in the source tree and inserting them one by one into + * the new tree. + * The user needs to ensure that the attributes of the source tree and the new + * tree are the same, and the new tree needs to be an empty tree, otherwise + * -EINVAL will be returned. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If + * the attributes of the two trees are different or the new tree is not an empty + * tree. + */ +int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) +{ + int ret = 0; + MA_STATE(mas, mt, 0, 0); + MA_STATE(new_mas, new, 0, 0); + + mas_lock(&new_mas); + mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); + mas_dup_build(&mas, &new_mas, gfp); + mas_unlock(&mas); + if (unlikely(mas_is_err(&mas))) { + ret = xa_err(mas.node); + if (ret == -ENOMEM) + mas_dup_free(&new_mas); + } + + mas_unlock(&new_mas); + return ret; +} +EXPORT_SYMBOL(mtree_dup); + /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree -- cgit v1.2.3 From d2406291483775ecddaee929231a39c70c08fda2 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:45 +0800 Subject: fork: use __mt_dup() to duplicate maple tree in dup_mmap() In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then directly replacing the entries of VMAs in the new maple tree can result in better performance. __mt_dup() uses DFS pre-order to duplicate the maple tree, so it is efficient. The average time complexity of __mt_dup() is O(n), where n is the number of VMAs. The proof of the time complexity is provided in the commit log that introduces __mt_dup(). After duplicating the maple tree, each element is traversed and replaced (ignoring the cases of deletion, which are rare). Since it is only a replacement operation for each element, this process is also O(n). 
Analyzing the exact time complexity of the previous algorithm is challenging because each insertion can involve appending to a node, pushing data to adjacent nodes, or even splitting nodes. The frequency of each action is difficult to calculate. The worst-case scenario for a single insertion is when the tree undergoes splitting at every level. If we consider each insertion as the worst-case scenario, we can determine that the upper bound of the time complexity is O(n*log(n)), although this is a loose upper bound. However, based on the test data, it appears that the actual time complexity is likely to be O(n). As the entire maple tree is duplicated using __mt_dup(), if dup_mmap() fails, there will be a portion of VMAs that have not been duplicated in the maple tree. To handle this, we mark the failure point with XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, we stop releasing VMAs at that point, because the VMAs after it have not been duplicated. There is a "spawn" in byte-unixbench[1], which can be used to test the performance of fork(). I modified it slightly to make it work with different numbers of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and this patchset, respectively. The fourth row shows the relative improvement. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance. 21 121 221 421 821 1621 3221 6421 12821 25621 51221 112100 76261 54227 34035 20195 11112 6017 3161 1606 802 393 114558 83067 65008 45824 28751 16072 8922 4747 2436 1233 599 2.19% 8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42% [1] https://github.com/kdlucas/byte-unixbench/tree/master Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. 
Howlett Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 +++++++++++ kernel/fork.c | 40 +++++++++++++++++++++++++++++----------- mm/internal.h | 11 ----------- mm/memory.c | 7 ++++++- mm/mmap.c | 9 ++++++--- 5 files changed, 52 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 418d26608ece..64cd1ee4aacc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -994,6 +994,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, return mas_expected_entries(&vmi->mas, count); } +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + __mas_set_range(&vmi->mas, start, end - 1); + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { diff --git a/kernel/fork.c b/kernel/fork.c index 10917c3e1f03..93924392a5c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -650,7 +650,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - VMA_ITERATOR(old_vmi, oldmm, 0); VMA_ITERATOR(vmi, mm, 0); uprobe_start_dup_mmap(); @@ -678,16 +677,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count); - if (retval) + /* Use __mt_dup() to efficiently build an identical maple tree. 
*/ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) goto out; mt_clear_in_rcu(vmi.mas.tree); - for_each_vma(old_vmi, mpnt) { + for_each_vma(vmi, mpnt) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, + mpnt->vm_end, GFP_KERNEL); + if (retval) + goto loop_out; + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } @@ -749,9 +754,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); - /* Link the vma into the MT */ - if (vma_iter_bulk_store(&vmi, tmp)) - goto fail_nomem_vmi_store; + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -760,15 +767,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); - if (retval) + if (retval) { + mpnt = vma_next(&vmi); goto loop_out; + } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); - if (!retval) + if (!retval) { mt_set_in_rcu(vmi.mas.tree); + } else if (mpnt) { + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. 
+ */ + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&vmi.mas, XA_ZERO_ENTRY); + } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -778,8 +798,6 @@ fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -fail_nomem_vmi_store: - unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: diff --git a/mm/internal.h b/mm/internal.h index b61034bd50f5..89a5a794d68f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1154,17 +1154,6 @@ static inline void vma_iter_clear(struct vma_iterator *vmi) mas_store_prealloc(&vmi->mas, NULL); } -static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, - unsigned long start, unsigned long end, gfp_t gfp) -{ - __mas_set_range(&vmi->mas, start, end - 1); - mas_store_gfp(&vmi->mas, NULL, gfp); - if (unlikely(mas_is_err(&vmi->mas))) - return -ENOMEM; - - return 0; -} - static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); diff --git a/mm/memory.c b/mm/memory.c index 5c757fba8858..a7025ed5c65b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -374,6 +374,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, * be 0. This will underflow and is okay. 
*/ next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing @@ -395,6 +397,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); @@ -1744,7 +1748,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); hugetlb_zap_end(vma, &details); - } while ((vma = mas_find(mas, tree_end - 1)) != NULL); + vma = mas_find(mas, tree_end - 1); + } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/mmap.c b/mm/mmap.c index 1971bfffcc03..4f1cb814586d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3294,10 +3294,11 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); vma = mas_find(&mas, ULONG_MAX); - if (!vma) { + if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); - return; + mmap_write_lock(mm); + goto destroy; } lru_add_drain(); @@ -3332,11 +3333,13 @@ void exit_mmap(struct mm_struct *mm) remove_vma(vma, true); count++; cond_resched(); - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + vma = mas_find(&mas, ULONG_MAX); + } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); trace_exit_mmap(mm); +destroy: __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); -- cgit v1.2.3 From ff6c3d81f2e86b63a3a530683f89ef393882782a Mon Sep 17 00:00:00 2001 From: Liam Ni Date: Thu, 26 Oct 2023 10:03:29 +0800 Subject: NUMA: optimize detection of memory with no node id assigned by firmware Sanity check that makes sure the nodes cover all memory loops over numa_meminfo to count the pages that have node id assigned by the firmware, then loops again over memblock.memory to find 
the total amount of memory and in the end checks that the difference between the total memory and memory that is covered by nodes is less than some threshold. Worse, the loop over numa_meminfo calls __absent_pages_in_range() that also partially traverses memblock.memory. It's much simpler and more efficient to have a single traversal of memblock.memory that verifies that the amount of memory not covered by nodes is less than a threshold. Introduce memblock_validate_numa_coverage() that does exactly that and use it instead of numa_meminfo_cover_memory(). Link: https://lkml.kernel.org/r/20231026020329.327329-1-zhiguangni01@gmail.com Signed-off-by: Liam Ni Reviewed-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Bibo Mao Cc: Binbin Zhou Cc: Borislav Petkov Cc: Dave Hansen Cc: Feiyang Chen Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: WANG Xuerui Signed-off-by: Andrew Morton --- arch/loongarch/kernel/numa.c | 28 +--------------------------- arch/x86/mm/numa.c | 34 ++-------------------------------- include/linux/memblock.h | 1 + mm/memblock.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index 6e65ff12d5c7..8fe21f868f72 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -226,32 +226,6 @@ static void __init node_mem_init(unsigned int node) #ifdef CONFIG_ACPI_NUMA -/* - * Sanity check to catch more bad NUMA configurations (they are amazingly - * common). Make sure the nodes cover all memory. 
- */ -static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) -{ - int i; - u64 numaram, biosram; - - numaram = 0; - for (i = 0; i < mi->nr_blks; i++) { - u64 s = mi->blk[i].start >> PAGE_SHIFT; - u64 e = mi->blk[i].end >> PAGE_SHIFT; - - numaram += e - s; - numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); - if ((s64)numaram < 0) - numaram = 0; - } - max_pfn = max_low_pfn; - biosram = max_pfn - absent_pages_in_range(0, max_pfn); - - BUG_ON((s64)(biosram - numaram) >= (1 << (20 - PAGE_SHIFT))); - return true; -} - static void __init add_node_intersection(u32 node, u64 start, u64 size, u32 type) { static unsigned long num_physpages; @@ -396,7 +370,7 @@ int __init init_numa_memory(void) return -EINVAL; init_node_memblock(); - if (numa_meminfo_cover_memory(&numa_meminfo) == false) + if (!memblock_validate_numa_coverage(SZ_1M)) return -EINVAL; for_each_node_mask(node, node_possible_map) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index b29ceb19e46e..adc497b93f03 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -449,37 +449,6 @@ int __node_distance(int from, int to) } EXPORT_SYMBOL(__node_distance); -/* - * Sanity check to catch more bad NUMA configurations (they are amazingly - * common). Make sure the nodes cover all memory. - */ -static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) -{ - u64 numaram, e820ram; - int i; - - numaram = 0; - for (i = 0; i < mi->nr_blks; i++) { - u64 s = mi->blk[i].start >> PAGE_SHIFT; - u64 e = mi->blk[i].end >> PAGE_SHIFT; - numaram += e - s; - numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); - if ((s64)numaram < 0) - numaram = 0; - } - - e820ram = max_pfn - absent_pages_in_range(0, max_pfn); - - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { - printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. 
Not used.\n", - (numaram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return false; - } - return true; -} - /* * Mark all currently memblock-reserved physical memory (which covers the * kernel's own memory ranges) as hot-unswappable. @@ -585,7 +554,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) return -EINVAL; } } - if (!numa_meminfo_cover_memory(mi)) + + if (!memblock_validate_numa_coverage(SZ_1M)) return -EINVAL; /* Finally register nodes. */ diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ae3bde302f70..b695f9e946da 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -123,6 +123,7 @@ int memblock_physmem_add(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); bool memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); +bool memblock_validate_numa_coverage(unsigned long threshold_bytes); int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); diff --git a/mm/memblock.c b/mm/memblock.c index 5a88d6d24d79..4a62f7774b65 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -734,6 +734,40 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); } +/** + * memblock_validate_numa_coverage - check if amount of memory with + * no node ID assigned is less than a threshold + * @threshold_bytes: maximal number of pages that can have unassigned node + * ID (in bytes). + * + * A buggy firmware may report memory that does not belong to any node. + * Check if amount of such memory is below @threshold_bytes. + * + * Return: true on success, false on failure. 
+ */ +bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_bytes) +{ + unsigned long nr_pages = 0; + unsigned long start_pfn, end_pfn, mem_size_mb; + int nid, i; + + /* calculate lose page */ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + if (nid == NUMA_NO_NODE) + nr_pages += end_pfn - start_pfn; + } + + if ((nr_pages << PAGE_SHIFT) >= threshold_bytes) { + mem_size_mb = memblock_phys_mem_size() >> 20; + pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n", + (nr_pages << PAGE_SHIFT) >> 20, mem_size_mb); + return false; + } + + return true; +} + + /** * memblock_isolate_range - isolate given range into disjoint memblocks * @type: memblock type to isolate range for -- cgit v1.2.3 From e6a9a2cbc13bf43e4c03f57666e93d511249d5d7 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 6 Nov 2023 14:09:58 -0800 Subject: fs/proc/task_mmu: report SOFT_DIRTY bits through the PAGEMAP_SCAN ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PAGEMAP_SCAN ioctl returns information regarding page table entries. It is more efficient compared to reading pagemap files. CRIU can start to utilize this ioctl, but it needs info about soft-dirty bits to track memory changes. We are aware of a new method for tracking memory changes implemented in the PAGEMAP_SCAN ioctl. For CRIU, the primary advantage of this method is its usability by unprivileged users. However, it is not feasible to transparently replace the soft-dirty tracker with the new one. The main problem here is userfault descriptors that have to be preserved between pre-dump iterations. It means criu continues supporting the soft-dirty method to avoid breakage for current users. The new method will be implemented as a separate feature. 
[avagin@google.com: update tools/include/uapi/linux/fs.h] Link: https://lkml.kernel.org/r/20231107164139.576046-1-avagin@google.com Link: https://lkml.kernel.org/r/20231106220959.296568-1-avagin@google.com Signed-off-by: Andrei Vagin Reviewed-by: Muhammad Usama Anjum Cc: Michał Mirosław Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/pagemap.rst | 1 + fs/proc/task_mmu.c | 17 ++++++++++++++++- include/uapi/linux/fs.h | 1 + tools/include/uapi/linux/fs.h | 1 + 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index fe17cf210426..f5f065c67615 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -253,6 +253,7 @@ Following flags about pages are currently supported: - ``PAGE_IS_SWAPPED`` - Page is in swapped - ``PAGE_IS_PFNZERO`` - Page has zero PFN - ``PAGE_IS_HUGE`` - Page is THP or Hugetlb backed +- ``PAGE_IS_SOFT_DIRTY`` - Page is soft-dirty The ``struct pm_scan_arg`` is used as the argument of the IOCTL. 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 435b61054b5b..d19924bf0a39 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1761,7 +1761,7 @@ static int pagemap_release(struct inode *inode, struct file *file) #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ - PAGE_IS_HUGE) + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { @@ -1793,6 +1793,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t swp; @@ -1806,6 +1808,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, !PageAnon(pfn_swap_entry_to_page(swp))) categories |= PAGE_IS_FILE; } + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } return categories; @@ -1853,12 +1857,16 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, if (is_zero_pfn(pmd_pfn(pmd))) categories |= PAGE_IS_PFNZERO; + if (pmd_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pmd(pmd)) { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; + if (pmd_swp_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); @@ -1905,10 +1913,14 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) categories |= PAGE_IS_FILE; if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } return categories; @@ 
-2007,6 +2019,9 @@ static int pagemap_scan_test_walk(unsigned long start, unsigned long end, if (wp_allowed) vma_category |= PAGE_IS_WPALLOWED; + if (vma->vm_flags & VM_SOFTDIRTY) + vma_category |= PAGE_IS_SOFT_DIRTY; + if (!pagemap_scan_is_interesting_vma(vma_category, p)) return 1; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index da43810b7485..48ad69f7722e 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -316,6 +316,7 @@ typedef int __bitwise __kernel_rwf_t; #define PAGE_IS_SWAPPED (1 << 4) #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) /* * struct page_region - Page region with flags diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index da43810b7485..48ad69f7722e 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -316,6 +316,7 @@ typedef int __bitwise __kernel_rwf_t; #define PAGE_IS_SWAPPED (1 << 4) #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) /* * struct page_region - Page region with flags -- cgit v1.2.3 From a4fc4a0c45f2617c3aa8b693739de264e0c09909 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Nov 2023 21:26:40 +0000 Subject: mm: add folio_zero_tail() and use it in ext4 Patch series "Add folio_zero_tail() and folio_fill_tail()". I'm trying to make it easier for filesystems with tailpacking / stuffing / inline data to use folios. The primary function here is folio_fill_tail(). You give it a pointer to memory where the data currently is, and it takes care of copying it into the folio at that offset. That works for gfs2 & iomap. Then There's Ext4. Rather than gin up some kind of specialist "Here are two pointers to two blocks of memory" routine, just let it do its current thing, and let it call folio_zero_tail(), which is also called by folio_fill_tail().
Other filesystems can be converted later; these ones seemed like good examples as they're already partly or completely converted to folios. This patch (of 3): Instead of unmapping the folio after copying the data to it, then mapping it again to zero the tail, provide folio_zero_tail() to zero the tail of an already-mapped folio. [akpm@linux-foundation.org: fix kerneldoc argument ordering] Link: https://lkml.kernel.org/r/20231107212643.3490372-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231107212643.3490372-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Darrick J. Wong Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/inline.c | 3 +-- include/linux/highmem.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 9a84a5f9fef4..d5bd1e3a5d36 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -502,9 +502,8 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) BUG_ON(len > PAGE_SIZE); kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); - flush_dcache_folio(folio); + kaddr = folio_zero_tail(folio, len, kaddr + len); kunmap_local(kaddr); - folio_zero_segment(folio, len, folio_size(folio)); folio_mark_uptodate(folio); brelse(iloc.bh); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index be20cff4ba73..5ebd5e4dfbf8 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -483,6 +483,44 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset, flush_dcache_folio(folio); } +/** + * folio_zero_tail - Zero the tail of a folio. + * @folio: The folio to zero. + * @offset: The byte offset in the folio to start zeroing at. + * @kaddr: The address the folio is currently mapped to. 
+ * + * If you have already used kmap_local_folio() to map a folio, written + * some data to it and now need to zero the end of the folio (and flush + * the dcache), you can use this function. If you do not have the + * folio kmapped (eg the folio has been partially populated by DMA), + * use folio_zero_range() or folio_zero_segment() instead. + * + * Return: An address which can be passed to kunmap_local(). + */ +static inline __must_check void *folio_zero_tail(struct folio *folio, + size_t offset, void *kaddr) +{ + size_t len = folio_size(folio) - offset; + + if (folio_test_highmem(folio)) { + size_t max = PAGE_SIZE - offset_in_page(offset); + + while (len > max) { + memset(kaddr, 0, max); + kunmap_local(kaddr); + len -= max; + offset += max; + max = PAGE_SIZE; + kaddr = kmap_local_folio(folio, offset); + } + } + + memset(kaddr, 0, len); + flush_dcache_folio(folio); + + return kaddr; +} + /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. -- cgit v1.2.3 From 6eaa266b54660f6b3654ad8902b4f7027054f55a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Nov 2023 21:26:41 +0000 Subject: mm: add folio_fill_tail() and use it in iomap The iomap code was limited to PAGE_SIZE bytes; generalise it to cover an arbitrary-sized folio, and move it to be a common helper. [akpm@linux-foundation.org: fix folio_fill_tail(), per Andreas Gruenbacher] Link: https://lkml.kernel.org/r/20231107212643.3490372-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Andreas Dilger Cc: Darrick J. 
Wong Cc: Theodore Ts'o Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton --- fs/iomap/buffered-io.c | 14 ++------------ include/linux/highmem.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f72df2babe56..093c4515b22a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -305,28 +305,18 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, { const struct iomap *iomap = iomap_iter_srcmap(iter); size_t size = i_size_read(iter->inode) - iomap->offset; - size_t poff = offset_in_page(iomap->offset); size_t offset = offset_in_folio(folio, iomap->offset); - void *addr; if (folio_test_uptodate(folio)) return 0; - if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) - return -EIO; - if (WARN_ON_ONCE(size > PAGE_SIZE - - offset_in_page(iomap->inline_data))) - return -EIO; if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) ifs_alloc(iter->inode, folio, iter->flags); - addr = kmap_local_folio(folio, offset); - memcpy(addr, iomap->inline_data, size); - memset(addr + size, 0, PAGE_SIZE - poff - size); - kunmap_local(addr); - iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff); + folio_fill_tail(folio, offset, iomap->inline_data, size); + iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset); return 0; } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 5ebd5e4dfbf8..451c1dff0e87 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -521,6 +521,44 @@ static inline __must_check void *folio_zero_tail(struct folio *folio, return kaddr; } +/** + * folio_fill_tail - Copy some data to a folio and pad with zeroes. + * @folio: The destination folio. + * @offset: The offset into @folio at which to start copying. + * @from: The data to copy. + * @len: How many bytes of data to copy. + * + * This function is most useful for filesystems which support inline data. 
+ * When they want to copy data from the inode into the page cache, this + * function does everything for them. It supports large folios even on + * HIGHMEM configurations. + */ +static inline void folio_fill_tail(struct folio *folio, size_t offset, + const char *from, size_t len) +{ + char *to = kmap_local_folio(folio, offset); + + VM_BUG_ON(offset + len > folio_size(folio)); + + if (folio_test_highmem(folio)) { + size_t max = PAGE_SIZE - offset_in_page(offset); + + while (len > max) { + memcpy(to, from, max); + kunmap_local(to); + len -= max; + from += max; + offset += max; + max = PAGE_SIZE; + to = kmap_local_folio(folio, offset); + } + } + + memcpy(to, from, len); + to = folio_zero_tail(folio, offset + len, to + len); + kunmap_local(to); +} + /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. -- cgit v1.2.3 From c36f9d3d2c3e17f9eef1d2f47a63c91d51d55e87 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 20:46:02 +0000 Subject: mm: remove test_set_page_writeback() Patch series "Make folio_start_writeback return void". Most of the folio flag-setting functions return void. folio_start_writeback is gratuitously different; the only two filesystems that do anything with the return value emit debug messages if it's already set, and we can (and should) do that internally without bothering the filesystem to do it. This patch (of 4): There are no more callers of this wrapper. 
Link: https://lkml.kernel.org/r/20231108204605.745109-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231108204605.745109-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Howells Cc: Steve French Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a88e64acebfe..a440062e9386 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -780,11 +780,6 @@ bool set_page_writeback(struct page *page); #define folio_start_writeback_keepwrite(folio) \ __folio_start_writeback(folio, true) -static inline bool test_set_page_writeback(struct page *page) -{ - return set_page_writeback(page); -} - static __always_inline bool folio_test_head(struct folio *folio) { return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); -- cgit v1.2.3 From b5612c368648a7be52411b288d09593e5945d1aa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 20:46:05 +0000 Subject: mm: return void from folio_start_writeback() and related functions Nobody now checks the return value from any of these functions, so add an assertion at the beginning of the function and return void. 
Link: https://lkml.kernel.org/r/20231108204605.745109-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Josef Bacik Cc: David Howells Cc: Steve French Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 4 ++-- mm/folio-compat.c | 4 ++-- mm/page-writeback.c | 54 +++++++++++++++++++++------------------------- 3 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a440062e9386..735cddc13d20 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -772,8 +772,8 @@ static __always_inline void SetPageUptodate(struct page *page) CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) -bool __folio_start_writeback(struct folio *folio, bool keep_write); -bool set_page_writeback(struct page *page); +void __folio_start_writeback(struct folio *folio, bool keep_write); +void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 10c3247542cb..aee3b9a16828 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -46,9 +46,9 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -bool set_page_writeback(struct page *page) +void set_page_writeback(struct page *page) { - return folio_start_writeback(page_folio(page)); + folio_start_writeback(page_folio(page)); } EXPORT_SYMBOL(set_page_writeback); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ee2fd6a6af40..ca64bd513fa2 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2982,67 +2982,63 @@ bool __folio_end_writeback(struct folio *folio) return ret; } -bool __folio_start_writeback(struct folio *folio, bool keep_write) +void __folio_start_writeback(struct folio *folio, bool keep_write) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); - bool ret; int access_ret; + 
VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); + folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; + bool on_wblist; xas_lock_irqsave(&xas, flags); xas_load(&xas); - ret = folio_test_set_writeback(folio); - if (!ret) { - bool on_wblist; + folio_test_set_writeback(folio); - on_wblist = mapping_tagged(mapping, - PAGECACHE_TAG_WRITEBACK); + on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { - struct bdi_writeback *wb = inode_to_wb(inode); - - wb_stat_mod(wb, WB_WRITEBACK, nr); - if (!on_wblist) - wb_inode_writeback_start(wb); - } + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { + struct bdi_writeback *wb = inode_to_wb(inode); - /* - * We can come through here when swapping - * anonymous folios, so we don't necessarily - * have an inode to track for sync. - */ - if (mapping->host && !on_wblist) - sb_mark_inode_writeback(mapping->host); + wb_stat_mod(wb, WB_WRITEBACK, nr); + if (!on_wblist) + wb_inode_writeback_start(wb); } + + /* + * We can come through here when swapping anonymous + * folios, so we don't necessarily have an inode to + * track for sync. 
+ */ + if (mapping->host && !on_wblist) + sb_mark_inode_writeback(mapping->host); if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); } else { - ret = folio_test_set_writeback(folio); - } - if (!ret) { - lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); - zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); + folio_test_set_writeback(folio); } + + lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); folio_memcg_unlock(folio); + access_ret = arch_make_folio_accessible(folio); /* * If writeback has been triggered on a page that cannot be made * accessible, it is too late to recover here. */ VM_BUG_ON_FOLIO(access_ret != 0, folio); - - return ret; } EXPORT_SYMBOL(__folio_start_writeback); -- cgit v1.2.3 From 16f5dfbc851b55b87101a20e181d4a14be3007d6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Nov 2023 21:15:07 +0000 Subject: gfp: include __GFP_NOWARN in GFP_NOWAIT GFP_NOWAIT callers are always prepared for their allocations to fail because they fail so frequently. Forcing the callers to remember to add __GFP_NOWARN is just annoying and leads to an endless stream of patches for the places where we forgot to add it. We can now remove __GFP_NOWARN from all the callers which specify GFP_NOWAIT, but I'd rather wait a cycle and send patches to each maintainer instead of creating a big pile of merge conflicts. 
Link: https://lkml.kernel.org/r/20231109211507.2262419-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6583a58670c5..ae994534a12a 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -274,7 +274,8 @@ typedef unsigned int __bitwise gfp_t; * accounted to kmemcg. * * %GFP_NOWAIT is for kernel allocations that should not stall for direct - * reclaim, start physical IO or use any filesystem callback. + * reclaim, start physical IO or use any filesystem callback. It is very + * likely to fail to allocate memory, even for very small allocations. * * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. @@ -325,7 +326,7 @@ typedef unsigned int __bitwise gfp_t; #define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) -#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) +#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM | __GFP_NOWARN) #define GFP_NOIO (__GFP_RECLAIM) #define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) #define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) -- cgit v1.2.3 From af7628d6ec196999175ecb3fdb38336489b0f88a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 16:14:47 +0000 Subject: fs: convert error_remove_page to error_remove_folio There were already assertions that we were not passing a tail page to error_remove_page(), so make the compiler enforce that by converting everything to pass and use a folio. 
Link: https://lkml.kernel.org/r/20231117161447.2461643-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- Documentation/filesystems/locking.rst | 4 ++-- Documentation/filesystems/vfs.rst | 6 +++--- block/fops.c | 2 +- fs/afs/write.c | 2 +- fs/bcachefs/fs.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/addr.c | 4 ++-- fs/ext2/inode.c | 2 +- fs/ext4/inode.c | 6 +++--- fs/f2fs/compress.c | 2 +- fs/f2fs/inode.c | 2 +- fs/gfs2/aops.c | 4 ++-- fs/hugetlbfs/inode.c | 6 +++--- fs/nfs/file.c | 2 +- fs/ntfs/aops.c | 6 +++--- fs/ocfs2/aops.c | 2 +- fs/xfs/xfs_aops.c | 2 +- fs/zonefs/file.c | 2 +- include/linux/fs.h | 2 +- include/linux/mm.h | 3 ++- mm/memory-failure.c | 10 +++++----- mm/shmem.c | 6 +++--- mm/truncate.c | 9 ++++----- 23 files changed, 44 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 7be2900806c8..421daf837940 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -261,7 +261,7 @@ prototypes:: struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); - int (*error_remove_page)(struct address_space *, struct page *); + int (*error_remove_folio)(struct address_space *, struct folio *); int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) int (*swap_deactivate)(struct file *); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); @@ -287,7 +287,7 @@ direct_IO: migrate_folio: yes (both) launder_folio: yes is_partially_uptodate: yes -error_remove_page: yes +error_remove_folio: yes swap_activate: no swap_deactivate: no swap_rw: yes, unlocks diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 99acc2e98673..dd99ce5912d8 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst 
@@ -823,7 +823,7 @@ cache in your filesystem. The following members are defined: bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback)(struct folio *, bool *, bool *); - int (*error_remove_page) (struct mapping *mapping, struct page *page); + int (*error_remove_folio)(struct mapping *mapping, struct folio *); int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span) int (*swap_deactivate)(struct file *); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); @@ -1034,8 +1034,8 @@ cache in your filesystem. The following members are defined: VM if a folio should be treated as dirty or writeback for the purposes of stalling. -``error_remove_page`` - normally set to generic_error_remove_page if truncation is ok +``error_remove_folio`` + normally set to generic_error_remove_folio if truncation is ok for this address space. Used for memory failure handling. Setting this implies you deal with pages going away under you, unless you have them locked or reference counts increased. 
diff --git a/block/fops.c b/block/fops.c index 0abaac705daf..0bdad1e8d514 100644 --- a/block/fops.c +++ b/block/fops.c @@ -500,7 +500,7 @@ const struct address_space_operations def_blk_aops = { .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ diff --git a/fs/afs/write.c b/fs/afs/write.c index 57d05d67f0c2..e87b52b1f34c 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -242,7 +242,7 @@ static void afs_kill_pages(struct address_space *mapping, folio_clear_uptodate(folio); folio_end_writeback(folio); folio_lock(folio); - generic_error_remove_page(mapping, &folio->page); + generic_error_remove_folio(mapping, folio); folio_unlock(folio); folio_put(folio); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 4d51be813509..df4a97b6637b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1103,7 +1103,7 @@ static const struct address_space_operations bch_address_space_operations = { #ifdef CONFIG_MIGRATION .migrate_folio = filemap_migrate_folio, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; struct bcachefs_fid { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9f5a9894f88f..ff7b4efca24f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10930,7 +10930,7 @@ static const struct address_space_operations btrfs_aops = { .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, }; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 85be3bf18cdf..13af429ab030 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -907,8 +907,8 
@@ static void writepages_finish(struct ceph_osd_request *req) doutc(cl, "unlocking %p\n", page); if (remove_page) - generic_error_remove_page(inode->i_mapping, - page); + generic_error_remove_folio(inode->i_mapping, + page_folio(page)); unlock_page(page); } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 464faf6c217e..5a4272b2c6b0 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -969,7 +969,7 @@ const struct address_space_operations ext2_aops = { .writepages = ext2_writepages, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; static const struct address_space_operations ext2_dax_aops = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 61277f7f8722..d7729b17a66b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3564,7 +3564,7 @@ static const struct address_space_operations ext4_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3581,7 +3581,7 @@ static const struct address_space_operations ext4_journalled_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; @@ -3598,7 +3598,7 @@ static const struct address_space_operations ext4_da_aops = { .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; diff --git a/fs/f2fs/compress.c 
b/fs/f2fs/compress.c index 36e5dab6baae..6b2af514660d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1944,7 +1944,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) continue; } - generic_error_remove_page(mapping, &folio->page); + generic_error_remove_folio(mapping, folio); folio_unlock(folio); } folio_batch_release(&fbatch); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 560bfcad1af2..a9eb3891f417 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -600,7 +600,7 @@ make_now: #ifdef CONFIG_F2FS_FS_COMPRESSION inode->i_mapping->a_ops = &f2fs_compress_aops; /* - * generic_error_remove_page only truncates pages of regular + * generic_error_remove_folio only truncates pages of regular * inode */ inode->i_mode |= S_IFREG; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ba8742dc91f8..5cffb079b87c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -745,7 +745,7 @@ static const struct address_space_operations gfs2_aops = { .bmap = gfs2_bmap, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; static const struct address_space_operations gfs2_jdata_aops = { @@ -758,7 +758,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .invalidate_folio = gfs2_invalidate_folio, .release_folio = gfs2_release_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; void gfs2_set_aops(struct inode *inode) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f757d4f7ad98..36132c9125f9 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1129,8 +1129,8 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, #define hugetlbfs_migrate_folio NULL #endif -static int hugetlbfs_error_remove_page(struct address_space *mapping, - struct page 
*page) +static int hugetlbfs_error_remove_folio(struct address_space *mapping, + struct folio *folio) { return 0; } @@ -1277,7 +1277,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, .migrate_folio = hugetlbfs_migrate_folio, - .error_remove_page = hugetlbfs_error_remove_page, + .error_remove_folio = hugetlbfs_error_remove_folio, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3f9768810427..e8cccb94b927 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -567,7 +567,7 @@ const struct address_space_operations nfs_file_aops = { .migrate_folio = nfs_migrate_folio, .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, .swap_rw = nfs_swap_rw, diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 71e31e789b29..70479ce915e8 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1644,7 +1644,7 @@ const struct address_space_operations ntfs_normal_aops = { .bmap = ntfs_bmap, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; /* @@ -1658,7 +1658,7 @@ const struct address_space_operations ntfs_compressed_aops = { #endif /* NTFS_RW */ .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; /* @@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_mst_aops = { #endif /* NTFS_RW */ .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; #ifdef NTFS_RW diff --git 
a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ba790219d528..795997806326 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2480,5 +2480,5 @@ const struct address_space_operations ocfs2_aops = { .release_folio = ocfs2_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, }; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 465d7630bb21..813f85156b0c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -584,7 +584,7 @@ const struct address_space_operations xfs_address_space_operations = { .bmap = xfs_vm_bmap, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index b2c9b35df8f7..6ab2318a9c8e 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -180,7 +180,7 @@ const struct address_space_operations zonefs_file_aops = { .invalidate_folio = iomap_invalidate_folio, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = zonefs_swap_activate, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..31b2cf963db9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -434,7 +434,7 @@ struct address_space_operations { bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); - int (*error_remove_page)(struct address_space *, struct page *); + int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, diff 
--git a/include/linux/mm.h b/include/linux/mm.h index 64cd1ee4aacc..13a090271716 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2384,7 +2384,8 @@ extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int generic_error_remove_page(struct address_space *mapping, struct page *page); +int generic_error_remove_folio(struct address_space *mapping, + struct folio *folio); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6aec94821fda..d8c853b35dbb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -927,13 +927,13 @@ static int delete_from_lru_cache(struct folio *folio) return -EIO; } -static int truncate_error_page(struct folio *folio, unsigned long pfn, +static int truncate_error_folio(struct folio *folio, unsigned long pfn, struct address_space *mapping) { int ret = MF_FAILED; - if (mapping->a_ops->error_remove_page) { - int err = mapping->a_ops->error_remove_page(mapping, &folio->page); + if (mapping->a_ops->error_remove_folio) { + int err = mapping->a_ops->error_remove_folio(mapping, folio); if (err != 0) pr_info("%#lx: Failed to punch page: %d\n", pfn, err); @@ -1054,7 +1054,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) * * Open: to take i_rwsem or not for this? Right now we don't. 
*/ - ret = truncate_error_page(folio, page_to_pfn(p), mapping); + ret = truncate_error_folio(folio, page_to_pfn(p), mapping); if (has_extra_refcount(ps, p, extra_pins)) ret = MF_FAILED; @@ -1188,7 +1188,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) mapping = folio_mapping(folio); if (mapping) { - res = truncate_error_page(folio, page_to_pfn(p), mapping); + res = truncate_error_folio(folio, page_to_pfn(p), mapping); /* The page is kept in page cache. */ extra_pins = true; folio_unlock(folio); diff --git a/mm/shmem.c b/mm/shmem.c index 91e2620148b2..97bc622da774 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4445,8 +4445,8 @@ static void __init shmem_destroy_inodecache(void) } /* Keep the page in page cache instead of truncating it */ -static int shmem_error_remove_page(struct address_space *mapping, - struct page *page) +static int shmem_error_remove_folio(struct address_space *mapping, + struct folio *folio) { return 0; } @@ -4461,7 +4461,7 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif - .error_remove_page = shmem_error_remove_page, + .error_remove_folio = shmem_error_remove_folio, }; EXPORT_SYMBOL(shmem_aops); diff --git a/mm/truncate.c b/mm/truncate.c index 52e3a703e7b2..725b150e47ac 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -250,10 +250,9 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) /* * Used to get rid of pages on hardware memory corruption. 
*/ -int generic_error_remove_page(struct address_space *mapping, struct page *page) +int generic_error_remove_folio(struct address_space *mapping, + struct folio *folio) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (!mapping) return -EINVAL; /* @@ -262,9 +261,9 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page) */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; - return truncate_inode_folio(mapping, page_folio(page)); + return truncate_inode_folio(mapping, folio); } -EXPORT_SYMBOL(generic_error_remove_page); +EXPORT_SYMBOL(generic_error_remove_folio); /** * mapping_evict_folio() - Remove an unused folio from the page-cache. -- cgit v1.2.3 From 022012dcf44209074af97b6ae531a10c08736b31 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:13 +0100 Subject: lib/stackdepot, kasan: add flags to __stack_depot_save and rename Change the bool can_alloc argument of __stack_depot_save to a u32 argument that accepts a set of flags. The following patch will add another flag to stack_depot_save_flags besides the existing STACK_DEPOT_FLAG_CAN_ALLOC. 
Also rename the function to stack_depot_save_flags, as __stack_depot_save is a cryptic name. Link: https://lkml.kernel.org/r/645fa15239621eebbd3a10331e5864b718839512.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 36 +++++++++++++++++++++++++----------- lib/stackdepot.c | 16 +++++++++++----- mm/kasan/common.c | 7 ++++--- mm/kasan/generic.c | 9 +++++---- mm/kasan/kasan.h | 2 +- mm/kasan/tags.c | 3 ++- 6 files changed, 48 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index e58306783d8e..0b262e14144e 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -32,6 +32,17 @@ typedef u32 depot_stack_handle_t; */ #define STACK_DEPOT_EXTRA_BITS 5 +typedef u32 depot_flags_t; + +/* + * Flags that can be passed to stack_depot_save_flags(); see the comment next + * to its declaration for more details. + */ +#define STACK_DEPOT_FLAG_CAN_ALLOC ((depot_flags_t)0x0001) + +#define STACK_DEPOT_FLAGS_NUM 1 +#define STACK_DEPOT_FLAGS_MASK ((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1)) + /* * Using stack depot requires its initialization, which can be done in 3 ways: * @@ -69,31 +80,34 @@ static inline int stack_depot_early_init(void) { return 0; } #endif /** - * __stack_depot_save - Save a stack trace to stack depot + * stack_depot_save_flags - Save a stack trace to stack depot * * @entries: Pointer to the stack trace * @nr_entries: Number of frames in the stack * @alloc_flags: Allocation GFP flags - * @can_alloc: Allocate stack pools (increased chance of failure if false) + * @depot_flags: Stack depot flags + * + * Saves a stack trace from @entries array of size @nr_entries. * - * Saves a stack trace from @entries array of size @nr_entries.
If @can_alloc is - * %true, stack depot can replenish the stack pools in case no space is left - * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids - * any allocations and fails if no space is left to store the stack trace. + * If STACK_DEPOT_FLAG_CAN_ALLOC is set in @depot_flags, stack depot can + * replenish the stack pools in case no space is left (allocates using GFP + * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and + * fails if no space is left to store the stack trace. * * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. * - * Context: Any context, but setting @can_alloc to %false is required if + * Context: Any context, but leaving STACK_DEPOT_FLAG_CAN_ALLOC unset is required if * alloc_pages() cannot be used from the current context. Currently * this is the case for contexts where neither %GFP_ATOMIC nor * %GFP_NOWAIT can be used (NMI, raw_spin_lock). * * Return: Handle of the stack struct stored in depot, 0 on failure */ -depot_stack_handle_t __stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - gfp_t gfp_flags, bool can_alloc); +depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, + unsigned int nr_entries, + gfp_t gfp_flags, + depot_flags_t depot_flags); /** * stack_depot_save - Save a stack trace to stack depot @@ -103,7 +117,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * @alloc_flags: Allocation GFP flags * * Context: Contexts where allocations via alloc_pages() are allowed. - * See __stack_depot_save() for more details. + * See stack_depot_save_flags() for more details.
* * Return: Handle of the stack trace stored in depot, 0 on failure */ diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 4bb0af423f82..59d61d5c09a7 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -450,19 +450,24 @@ static inline struct stack_record *find_stack(struct list_head *bucket, return NULL; } -depot_stack_handle_t __stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - gfp_t alloc_flags, bool can_alloc) +depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, + unsigned int nr_entries, + gfp_t alloc_flags, + depot_flags_t depot_flags) { struct list_head *bucket; struct stack_record *found = NULL; depot_stack_handle_t handle = 0; struct page *page = NULL; void *prealloc = NULL; + bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC; bool need_alloc = false; unsigned long flags; u32 hash; + if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK)) + return 0; + /* * If this stack trace is from an interrupt, including anything before * interrupt entry usually leads to unbounded stack depot growth. 
@@ -541,13 +546,14 @@ exit: handle = found->handle.handle; return handle; } -EXPORT_SYMBOL_GPL(__stack_depot_save); +EXPORT_SYMBOL_GPL(stack_depot_save_flags); depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) { - return __stack_depot_save(entries, nr_entries, alloc_flags, true); + return stack_depot_save_flags(entries, nr_entries, alloc_flags, + STACK_DEPOT_FLAG_CAN_ALLOC); } EXPORT_SYMBOL_GPL(stack_depot_save); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 256930da578a..825a0240ec02 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -37,19 +38,19 @@ struct slab *kasan_addr_to_slab(const void *addr) return NULL; } -depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) +depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags) { unsigned long entries[KASAN_STACK_DEPTH]; unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - return __stack_depot_save(entries, nr_entries, flags, can_alloc); + return stack_depot_save_flags(entries, nr_entries, flags, depot_flags); } void kasan_set_track(struct kasan_track *track, gfp_t flags) { track->pid = current->pid; - track->stack = kasan_save_stack(flags, true); + track->stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC); } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 4d837ab83f08..5d168c9afb32 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -472,7 +473,7 @@ size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) sizeof(struct kasan_free_meta) : 0); } -static void __kasan_record_aux_stack(void *addr, bool can_alloc) +static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags) { struct slab *slab = 
kasan_addr_to_slab(addr); struct kmem_cache *cache; @@ -489,17 +490,17 @@ static void __kasan_record_aux_stack(void *addr, bool can_alloc) return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; - alloc_meta->aux_stack[0] = kasan_save_stack(0, can_alloc); + alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags); } void kasan_record_aux_stack(void *addr) { - return __kasan_record_aux_stack(addr, true); + return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC); } void kasan_record_aux_stack_noalloc(void *addr) { - return __kasan_record_aux_stack(addr, false); + return __kasan_record_aux_stack(addr, 0); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8b06bab5c406..b29d46b83d1f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -368,7 +368,7 @@ static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } #endif -depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); +depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 7dcfe341d48e..4fd32121b0fd 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -101,7 +102,7 @@ static void save_stack_info(struct kmem_cache *cache, void *object, struct kasan_stack_ring_entry *entry; void *old_ptr; - stack = kasan_save_stack(gfp_flags, true); + stack = kasan_save_stack(gfp_flags, STACK_DEPOT_FLAG_CAN_ALLOC); /* * Prevent save_stack_info() from modifying stack ring -- cgit v1.2.3 From 410b764f89f59cce858d94fc781b68c1f27a0ca9 Mon Sep 17 
00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:14 +0100 Subject: lib/stackdepot: add refcount for records Add a reference counter for how many times a stack record has been added to stack depot. Add a new STACK_DEPOT_FLAG_GET flag to stack_depot_save_flags that instructs the stack depot to increment the refcount. Do not yet decrement the refcount; this is implemented in one of the following patches. Do not yet enable any users to use the flag to avoid overflowing the refcount. This is a preparatory patch for implementing the eviction of stack records from the stack depot. Link: https://lkml.kernel.org/r/a3fc14a2359d019d2a008d4ff8b46a665371ffee.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 13 ++++++++++--- lib/stackdepot.c | 12 ++++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 0b262e14144e..611716702d73 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -39,8 +39,9 @@ typedef u32 depot_flags_t; * to its declaration for more details. */ #define STACK_DEPOT_FLAG_CAN_ALLOC ((depot_flags_t)0x0001) +#define STACK_DEPOT_FLAG_GET ((depot_flags_t)0x0002) -#define STACK_DEPOT_FLAGS_NUM 1 +#define STACK_DEPOT_FLAGS_NUM 2 #define STACK_DEPOT_FLAGS_MASK ((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1)) /* @@ -94,6 +95,9 @@ static inline int stack_depot_early_init(void) { return 0; } * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and * fails if no space is left to store the stack trace. * + * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment + * the refcount on the saved stack trace if it already exists in stack depot.
+ * * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. * @@ -116,8 +120,11 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, * @nr_entries: Number of frames in the stack * @alloc_flags: Allocation GFP flags * - * Context: Contexts where allocations via alloc_pages() are allowed. - * See stack_depot_save_flags() for more details. + * Does not increment the refcount on the saved stack trace; see + * stack_depot_save_flags() for more details. + * + * Context: Contexts where allocations via alloc_pages() are allowed; + * see stack_depot_save_flags() for more details. * * Return: Handle of the stack trace stored in depot, 0 on failure */ diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 59d61d5c09a7..911dee11bf39 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,7 @@ struct stack_record { u32 hash; /* Hash in hash table */ u32 size; /* Number of stored frames */ union handle_parts handle; + refcount_t count; unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */ }; @@ -373,6 +375,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->hash = hash; stack->size = size; /* stack->handle is already filled in by depot_init_pool(). */ + refcount_set(&stack->count, 1); memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); /* @@ -489,6 +492,8 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, /* Fast path: look the stack trace up without full locking. 
*/ found = find_stack(bucket, entries, nr_entries, hash); if (found) { + if (depot_flags & STACK_DEPOT_FLAG_GET) + refcount_inc(&found->count); read_unlock_irqrestore(&pool_rwlock, flags); goto exit; } @@ -528,12 +533,15 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, list_add(&new->list, bucket); found = new; } - } else if (prealloc) { + } else { + if (depot_flags & STACK_DEPOT_FLAG_GET) + refcount_inc(&found->count); /* * Stack depot already contains this stack trace, but let's * keep the preallocated memory for future. */ - depot_keep_new_pool(&prealloc); + if (prealloc) + depot_keep_new_pool(&prealloc); } write_unlock_irqrestore(&pool_rwlock, flags); -- cgit v1.2.3 From 108be8def46e9422f5a5abc96b0ab8fb6b3fb344 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:15 +0100 Subject: lib/stackdepot: allow users to evict stack traces Add stack_depot_put, a function that decrements the reference counter on a stack record and removes it from the stack depot once the counter reaches 0. Internally, when removing a stack record, the function unlinks it from the hash table bucket and returns to the freelist. With this change, the users of stack depot can call stack_depot_put when keeping a stack trace in the stack depot is not needed anymore. This allows avoiding polluting the stack depot with irrelevant stack traces and thus have more space to store the relevant ones before the stack depot reaches its capacity. 
Link: https://lkml.kernel.org/r/1d1ad5692ee43d4fc2b3fd9d221331d30b36123f.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 14 ++++++++++++++ lib/stackdepot.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 611716702d73..a6796f178913 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -97,6 +97,8 @@ static inline int stack_depot_early_init(void) { return 0; } * * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment * the refcount on the saved stack trace if it already exists in stack depot. + * Users of this flag must also call stack_depot_put() when keeping the stack + * trace is no longer required to avoid overflowing the refcount. * * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. @@ -162,6 +164,18 @@ void stack_depot_print(depot_stack_handle_t stack); int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); +/** + * stack_depot_put - Drop a reference to a stack trace from stack depot + * + * @handle: Stack depot handle returned from stack_depot_save() + * + * The stack trace is evicted from stack depot once all references to it have + * been dropped (once the number of stack_depot_put() calls matches the + * number of stack_depot_save_flags() calls with STACK_DEPOT_FLAG_GET set for + * this stack trace).
+ */ +void stack_depot_put(depot_stack_handle_t handle); + /** * stack_depot_set_extra_bits - Set extra bits in a stack depot handle * diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 911dee11bf39..c1b31160f4b4 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -394,7 +394,7 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; - lockdep_assert_held_read(&pool_rwlock); + lockdep_assert_held(&pool_rwlock); if (parts.pool_index > pools_num) { WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n", @@ -410,6 +410,14 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) return stack; } +/* Links stack into the freelist. */ +static void depot_free_stack(struct stack_record *stack) +{ + lockdep_assert_held_write(&pool_rwlock); + + list_add(&stack->list, &free_stacks); +} + /* Calculates the hash for a stack. */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) { @@ -592,6 +600,33 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, } EXPORT_SYMBOL_GPL(stack_depot_fetch); +void stack_depot_put(depot_stack_handle_t handle) +{ + struct stack_record *stack; + unsigned long flags; + + if (!handle || stack_depot_disabled) + return; + + write_lock_irqsave(&pool_rwlock, flags); + + stack = depot_fetch_stack(handle); + if (WARN_ON(!stack)) + goto out; + + if (refcount_dec_and_test(&stack->count)) { + /* Unlink stack from the hash table. */ + list_del(&stack->list); + + /* Free stack. 
*/ + depot_free_stack(stack); + } + +out: + write_unlock_irqrestore(&pool_rwlock, flags); +} +EXPORT_SYMBOL_GPL(stack_depot_put); + void stack_depot_print(depot_stack_handle_t stack) { unsigned long *entries; -- cgit v1.2.3 From 95a2ac937013cc3aaaea02abcdd167b96874548d Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 20 Nov 2023 15:53:54 +0100 Subject: mm: use vmem_altmap code without CONFIG_ZONE_DEVICE vmem_altmap_free() and vmem_altmap_offset() could be utilized without CONFIG_ZONE_DEVICE enabled. For example, mm/memory_hotplug.c:__add_pages() relies on that. The altmap is no longer restricted to ZONE_DEVICE handling, but instead depends on CONFIG_SPARSEMEM_VMEMMAP. When CONFIG_SPARSEMEM_VMEMMAP is disabled, these functions are defined as inline stubs, ensuring compatibility with configurations that do not use sparsemem vmemmap. Without it, lkp reported the following: ld: arch/x86/mm/init_64.o: in function `remove_pagetable': init_64.c:(.meminit.text+0xfc7): undefined reference to `vmem_altmap_free' Link: https://lkml.kernel.org/r/20231120145354.308999-4-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311180545.VeyRXEDq-lkp@intel.com/ Reviewed-by: Gerald Schaefer Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- include/linux/memremap.h | 12 ------------ include/linux/mm.h | 26 ++++++++++++++++++++++++++ mm/memremap.c | 14 +------------- 3 files changed, 27 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1314d9c5f05b..744c830f4b13 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -196,8 +196,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); bool pgmap_pfn_valid(struct dev_pagemap *pgmap,
unsigned long pfn); -unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); -void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else static inline void *devm_memremap_pages(struct device *dev, @@ -228,16 +226,6 @@ static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) return false; } -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) -{ - return 0; -} - -static inline void vmem_altmap_free(struct vmem_altmap *altmap, - unsigned long nr_pfns) -{ -} - /* when memremap_pages() is disabled all archs can remap a single page */ static inline unsigned long memremap_compat_align(void) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 13a090271716..a422cc123a2d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3871,6 +3871,32 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + if (altmap) + return altmap->reserve + altmap->free; + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} +#else +static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ +} +#endif + #define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, diff --git a/mm/memremap.c b/mm/memremap.c index bee85560a243..9531faa92a7c 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -422,19 +423,6 @@ void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) } 
EXPORT_SYMBOL_GPL(devm_memunmap_pages); -unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) -{ - /* number of pfns from base where pfn_to_page() is valid */ - if (altmap) - return altmap->reserve + altmap->free; - return 0; -} - -void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) -{ - altmap->alloc -= nr_pfns; -} - /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn * @pfn: page frame number to lookup page_map -- cgit v1.2.3 From 38ca8a185389716e9f7566bce4bb0085f71da61d Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 21 Nov 2023 20:43:49 +0100 Subject: pgtable: fix s390 ptdesc field comments Patch series "minor ptdesc updates", v3. This patch (of 2): Since commit d08d4e7cd6bf ("s390/mm: use full 4KB page for 2KB PTE") there is no fragmented page tracking on s390. Fix the corresponding comments. Link: https://lkml.kernel.org/r/cover.1700594815.git.agordeev@linux.ibm.com Link: https://lkml.kernel.org/r/2eead241f3a45bed26c7911cf66bded1e35670b8.1700594815.git.agordeev@linux.ibm.com Signed-off-by: Alexander Gordeev Suggested-by: Heiko Carstens Cc: Gerald Schaefer Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2..fbec64036baa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -401,11 +401,11 @@ FOLIO_MATCH(compound_head, _head_2a); * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. * @pt_mm: Used for x86 pgds. - * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only. + * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. 
- * @_refcount: Same as page refcount. Used for s390 page tables. + * @_refcount: Same as page refcount. * @pt_memcg_data: Memcg data. Tracked for page tables here. * * This struct overlays struct page for now. Do not modify without a good -- cgit v1.2.3 From f7dd74ac239aad5ef7575ea03c45fd7956e00285 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 21 Nov 2023 20:43:50 +0100 Subject: pgtable: rename ptdesc _refcount field to __page_refcount Rename ptdesc _refcount field to __page_refcount similar to the other unused page fields. Link: https://lkml.kernel.org/r/982bdc652ba79a606c3d01c905766e7e076b3315.1700594815.git.agordeev@linux.ibm.com Signed-off-by: Alexander Gordeev Suggested-by: Vishal Moola Cc: Gerald Schaefer Cc: Heiko Carstens Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fbec64036baa..ef18d2b25378 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -405,7 +405,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. - * @_refcount: Same as page refcount. + * @__page_refcount: Same as page refcount. * @pt_memcg_data: Memcg data. Tracked for page tables here. * * This struct overlays struct page for now. 
Do not modify without a good @@ -438,7 +438,7 @@ struct ptdesc { #endif }; unsigned int __page_type; - atomic_t _refcount; + atomic_t __page_refcount; #ifdef CONFIG_MEMCG unsigned long pt_memcg_data; #endif @@ -452,7 +452,7 @@ TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); -TABLE_MATCH(_refcount, _refcount); +TABLE_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG TABLE_MATCH(memcg_data, pt_memcg_data); #endif -- cgit v1.2.3 From 7679e14098c9c3c8118a7130d6e1e9cfe2565c04 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Nov 2023 19:23:17 +0200 Subject: mm: list_lru: Update kernel documentation to follow the requirements kernel-doc is not happy about documentation in list_lru.h: list_lru.h:90: warning: Function parameter or member 'lru' not described in 'list_lru_add' list_lru.h:90: warning: Excess function parameter 'list_lru' description in 'list_lru_add' list_lru.h:90: warning: No description found for return value of 'list_lru_add' list_lru.h:103: warning: Function parameter or member 'lru' not described in 'list_lru_del' list_lru.h:103: warning: Excess function parameter 'list_lru' description in 'list_lru_del' list_lru.h:103: warning: No description found for return value of 'list_lru_del' list_lru.h:116: warning: No description found for return value of 'list_lru_count_one' list_lru.h:168: warning: No description found for return value of 'list_lru_walk_one' list_lru.h:185: warning: No description found for return value of 'list_lru_walk_one_irq' Fix the documentation accordingly. While at it, fix the references to the parameters in functions inside the long descriptions, on which the above script is not complaining (yet?). 
Link: https://lkml.kernel.org/r/20231123172320.2434780-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index b35968ee9fb5..db86ad78d428 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -73,7 +73,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren /** * list_lru_add: add an element to the lru list's tail - * @list_lru: the lru pointer + * @lru: the lru pointer * @item: the item to be added. * * If the element is already part of a list, this function returns doing @@ -83,22 +83,22 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * the caller organize itself in a way that elements can be in more than * one type of list, it is up to the caller to fully remove the item from * the previous list (with list_lru_del() for instance) before moving it - * to @list_lru + * to @lru. * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_add(struct list_lru *lru, struct list_head *item); /** * list_lru_del: delete an element to the lru list - * @list_lru: the lru pointer + * @lru: the lru pointer * @item: the item to be deleted. * - * This function works analogously as list_lru_add in terms of list + * This function works analogously as list_lru_add() in terms of list * manipulation. The comments about an element already pertaining to - * a list are also valid for list_lru_del. + * a list are also valid for list_lru_del(). 
* - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_del(struct list_lru *lru, struct list_head *item); @@ -108,9 +108,11 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item); * @nid: the node id to count from. * @memcg: the cgroup to count from. * - * Always return a non-negative number, 0 for empty lists. There is no - * guarantee that the list is not updated while the count is being computed. - * Callers that want such a guarantee need to provide an outer lock. + * There is no guarantee that the list is not updated while the count is being + * computed. Callers that want such a guarantee need to provide an outer lock. + * + * Return: 0 for empty lists, otherwise the number of objects + * currently held by @lru. */ unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg); @@ -141,7 +143,7 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); /** - * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. @@ -150,24 +152,24 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * - * This function will scan all elements in a particular list_lru, calling the + * This function will scan all elements in a particular @lru, calling the * @isolate callback for each of those items, along with the current list * spinlock and a caller-provided opaque. The @isolate callback can choose to * drop the lock internally, but *must* return with the lock held. 
The callback - * will return an enum lru_status telling the list_lru infrastructure what to + * will return an enum lru_status telling the @lru infrastructure what to * do with the object being scanned. * - * Please note that nr_to_walk does not mean how many objects will be freed, + * Please note that @nr_to_walk does not mean how many objects will be freed, * just how many objects will be scanned. * - * Return value: the number of objects effectively removed from the LRU. + * Return: the number of objects effectively removed from the LRU. */ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); /** - * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one_irq: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. @@ -176,7 +178,7 @@ unsigned long list_lru_walk_one(struct list_lru *lru, * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * - * Same as @list_lru_walk_one except that the spinlock is acquired with + * Same as list_lru_walk_one() except that the spinlock is acquired with * spin_lock_irq(). */ unsigned long list_lru_walk_one_irq(struct list_lru *lru, -- cgit v1.2.3 From bf857ddd21d0bffc1edafc317e8e2ce0d6d5950c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:20 -0400 Subject: maple_tree: move debug check to __mas_set_range() __mas_set_range() was created to shortcut resetting the maple state and a debug check was added to the caller (the vma iterator) to ensure the internal maple state remains safe to use. Move the debug check from the vma iterator into the maple tree itself so other users do not incorrectly use the advanced maple state modification. 
Fallout from this change includes a large amount of debug setup needed to be moved to earlier in the header, and the maple_tree.h radix-tree test code needed to move the inclusion of the header to after the atomic define. None of those changes have functional changes. Link: https://lkml.kernel.org/r/20231101171629.3612299-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 255 ++++++++++++++-------------- mm/internal.h | 2 - tools/testing/radix-tree/linux/maple_tree.h | 2 +- 3 files changed, 130 insertions(+), 129 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index a452dd8a1e5c..b5d5992578c9 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -557,6 +557,131 @@ static inline void mas_reset(struct ma_state *mas) */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) + +#ifdef CONFIG_DEBUG_MAPLE_TREE +enum mt_dump_format { + mt_dump_dec, + mt_dump_hex, +}; + +extern atomic_t maple_tree_tests_run; +extern atomic_t maple_tree_tests_passed; + +void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); +void mas_dump(const struct ma_state *mas); +void mas_wr_dump(const struct ma_wr_state *wr_mas); +void mt_validate(struct maple_tree *mt); +void mt_cache_shrink(void); +#define MT_BUG_ON(__tree, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_BUG_ON(__mas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + 
mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_WR_BUG_ON(__wrmas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MT_WARN_ON(__tree, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WARN_ON(__mas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WR_WARN_ON(__wrmas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + 
atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) +#else +#define MT_BUG_ON(__tree, __x) BUG_ON(__x) +#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) +#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) +#define MT_WARN_ON(__tree, __x) WARN_ON(__x) +#define MAS_WARN_ON(__mas, __x) WARN_ON(__x) +#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) +#endif /* CONFIG_DEBUG_MAPLE_TREE */ + /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. @@ -570,6 +695,9 @@ static inline void mas_reset(struct ma_state *mas) static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { + /* Ensure the range starts within the current slot */ + MAS_WARN_ON(mas, mas_is_active(mas) && + (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } @@ -587,8 +715,8 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { - __mas_set_range(mas, start, last); mas->node = MAS_START; + __mas_set_range(mas, start, last); } /** @@ -713,129 +841,4 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) - -#ifdef CONFIG_DEBUG_MAPLE_TREE -enum mt_dump_format { - mt_dump_dec, - mt_dump_hex, -}; - -extern atomic_t maple_tree_tests_run; -extern atomic_t maple_tree_tests_passed; - -void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); -void mas_dump(const struct ma_state *mas); -void mas_wr_dump(const struct ma_wr_state *wr_mas); -void mt_validate(struct maple_tree *mt); -void mt_cache_shrink(void); -#define MT_BUG_ON(__tree, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, 
__x); \ - mt_dump(__tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MAS_BUG_ON(__mas, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_dump(__mas); \ - mt_dump((__mas)->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MAS_WR_BUG_ON(__wrmas, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_wr_dump(__wrmas); \ - mas_dump((__wrmas)->mas); \ - mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MT_WARN_ON(__tree, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mt_dump(__tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ -}) - -#define MAS_WARN_ON(__mas, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_dump(__mas); \ - mt_dump((__mas)->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - 
atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ -}) - -#define MAS_WR_WARN_ON(__wrmas, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_wr_dump(__wrmas); \ - mas_dump((__wrmas)->mas); \ - mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ -}) -#else -#define MT_BUG_ON(__tree, __x) BUG_ON(__x) -#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) -#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) -#define MT_WARN_ON(__tree, __x) WARN_ON(__x) -#define MAS_WARN_ON(__mas, __x) WARN_ON(__x) -#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) -#endif /* CONFIG_DEBUG_MAPLE_TREE */ - #endif /*_LINUX_MAPLE_TREE_H */ diff --git a/mm/internal.h b/mm/internal.h index 2bc9ff8db393..0005b8adbd5c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1135,8 +1135,6 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) { - MAS_BUG_ON(&vmi->mas, vmi->mas.node != MAS_START && - (vmi->mas.index > index || vmi->mas.last < index)); __mas_set_range(&vmi->mas, index, last - 1); } diff --git a/tools/testing/radix-tree/linux/maple_tree.h b/tools/testing/radix-tree/linux/maple_tree.h index 7d8d1f445b89..06c89bdcc515 100644 --- a/tools/testing/radix-tree/linux/maple_tree.h +++ b/tools/testing/radix-tree/linux/maple_tree.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0+ */ #define atomic_t int32_t -#include "../../../../include/linux/maple_tree.h" #define atomic_inc(x) uatomic_inc(x) #define atomic_read(x) uatomic_read(x) #define atomic_set(x, y) do {} while (0) #define U8_MAX UCHAR_MAX +#include "../../../../include/linux/maple_tree.h" -- cgit v1.2.3 From 
31c532a8af57513228c2b12d281104198ff412b8 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:21 -0400 Subject: maple_tree: add end of node tracking to the maple state Analysis of the mas_for_each() iteration showed that there is a significant time spent finding the end of a node. This time can be greatly reduced if the end of the node is cached in the maple state. Care must be taken to update & invalidate as necessary. Link: https://lkml.kernel.org/r/20231101171629.3612299-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 + lib/maple_tree.c | 7 +++++++ tools/testing/radix-tree/maple.c | 1 + 3 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index b5d5992578c9..0b82efe0cf1e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -393,6 +393,7 @@ struct ma_state { unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; + unsigned char end; /* The end of the node */ }; struct ma_wr_state { diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 8d379d34ea0a..ea0a36341fed 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2841,6 +2841,7 @@ next: goto dead_node; } while (!ma_is_leaf(type)); + mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; @@ -3507,6 +3508,7 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, mas_replace_node(wr_mas->mas, old_enode); reuse_node: mas_update_gap(wr_mas->mas); + wr_mas->mas->end = b_end; return 1; } @@ -4010,6 +4012,7 @@ done: } trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); + mas->end = new_end; return true; } @@ -4190,6 +4193,7 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); + mas->end = new_end; trace_ma_write(__func__, mas, 
new_end, wr_mas->entry); return true; } @@ -4428,6 +4432,7 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) if (unlikely(mte_dead_node(mas->node))) return 1; + mas->end = mas->offset; return 0; no_entry: @@ -5074,6 +5079,7 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; + mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); @@ -5134,6 +5140,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, mas->last = max; mas->index = mas->last - size + 1; + mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index cb5358674521..7095fb0ec026 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -945,6 +945,7 @@ retry: goto retry; } + mas->end = mas_data_end(mas); return ret; not_found: -- cgit v1.2.3 From 067311d33e650adfe7ae23765959ddcc1ba18510 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:25 -0400 Subject: maple_tree: separate ma_state node from status The maple tree node is overloaded to keep status as well as the active node. This, unfortunately, results in a re-walk on underflow or overflow. Since the maple state has room, the status can be placed in its own enum in the structure. Once an underflow/overflow is detected, certain modes can restore the status to active and others may need to re-walk just that one node to see the entry. The status being an enum has the benefit of detecting unhandled status in switch statements. 
[Liam.Howlett@oracle.com: fix comments about MAS_*] Link: https://lkml.kernel.org/r/20231106154124.614247-1-Liam.Howlett@oracle.com [Liam.Howlett@oracle.com: update forking to separate maple state and node] Link: https://lkml.kernel.org/r/20231106154551.615042-1-Liam.Howlett@oracle.com [Liam.Howlett@oracle.com: fix mas_prev() state separation code] Link: https://lkml.kernel.org/r/20231207193319.4025462-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20231101171629.3612299-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 87 ++++---- include/linux/mm_types.h | 3 +- lib/maple_tree.c | 459 +++++++++++++++++++++++---------------- lib/test_maple_tree.c | 189 ++++++++-------- mm/internal.h | 8 +- tools/testing/radix-tree/maple.c | 26 ++- 6 files changed, 445 insertions(+), 327 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0b82efe0cf1e..4dd668f7b111 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -349,6 +349,36 @@ static inline bool mtree_empty(const struct maple_tree *mt) /* Advanced API */ +/* + * Maple State Status + * ma_active means the maple state is pointing to a node and offset and can + * continue operating on the tree. + * ma_start means we have not searched the tree. + * ma_root means we have searched the tree and the entry we found lives in + * the root of the tree (ie it has index 0, length 1 and is the only entry in + * the tree). + * ma_none means we have searched the tree and there is no node in the + * tree for this entry. For example, we searched for index 1 in an empty + * tree. Or we have a tree which points to a full leaf node and we + * searched for an entry which is larger than can be contained in that + * leaf node. 
+ * ma_pause means the data within the maple state may be stale, restart the + * operation + * ma_overflow means the search has reached the upper limit of the search + * ma_underflow means the search has reached the lower limit of the search + * ma_error means there was an error, check the node for the error number. + */ +enum maple_status { + ma_active, + ma_start, + ma_root, + ma_none, + ma_pause, + ma_overflow, + ma_underflow, + ma_error, +}; + /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the @@ -381,6 +411,13 @@ static inline bool mtree_empty(const struct maple_tree *mt) * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. + * + * The status of the state is used to determine how the next action should treat + * the state. For instance, if the status is ma_start then the next action + * should start at the root of the tree and walk down. If the status is + * ma_pause then the node may be stale data and should be discarded. If the + * status is ma_overflow, then the last action hit the upper limit. + * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ @@ -390,6 +427,7 @@ struct ma_state { unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ + enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; @@ -416,28 +454,12 @@ struct ma_wr_state { spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) - /* * Special values for ma_state.node. 
- * MAS_START means we have not searched the tree. - * MAS_ROOT means we have searched the tree and the entry we found lives in - * the root of the tree (ie it has index 0, length 1 and is the only entry in - * the tree). - * MAS_NONE means we have searched the tree and there is no node in the - * tree for this entry. For example, we searched for index 1 in an empty - * tree. Or we have a tree which points to a full leaf node and we - * searched for an entry which is larger than can be contained in that - * leaf node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ -#define MAS_START ((struct maple_enode *)1UL) -#define MAS_ROOT ((struct maple_enode *)5UL) -#define MAS_NONE ((struct maple_enode *)9UL) -#define MAS_PAUSE ((struct maple_enode *)17UL) -#define MAS_OVERFLOW ((struct maple_enode *)33UL) -#define MAS_UNDERFLOW ((struct maple_enode *)65UL) #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) @@ -446,7 +468,8 @@ struct ma_wr_state { .tree = mt, \ .index = first, \ .last = end, \ - .node = MAS_START, \ + .node = NULL, \ + .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ @@ -477,7 +500,6 @@ void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); -bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); @@ -506,28 +528,18 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; - mas->node = MAS_START; + mas->status = ma_start; + mas->node = NULL; } -/* Checks if a mas has not found anything */ -static 
inline bool mas_is_none(const struct ma_state *mas) -{ - return mas->node == MAS_NONE; -} - -/* Checks if a mas has been paused */ -static inline bool mas_is_paused(const struct ma_state *mas) +static inline bool mas_is_active(struct ma_state *mas) { - return mas->node == MAS_PAUSE; + return mas->status == ma_active; } -/* Check if the mas is pointing to a node or not */ -static inline bool mas_is_active(struct ma_state *mas) +static inline bool mas_is_err(struct ma_state *mas) { - if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE) - return true; - - return false; + return mas->status == ma_error; } /** @@ -540,9 +552,10 @@ static inline bool mas_is_active(struct ma_state *mas) * * Context: Any context. */ -static inline void mas_reset(struct ma_state *mas) +static __always_inline void mas_reset(struct ma_state *mas) { - mas->node = MAS_START; + mas->status = ma_start; + mas->node = NULL; } /** @@ -716,7 +729,7 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { - mas->node = MAS_START; + mas_reset(mas); __mas_set_range(mas, start, last); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ef18d2b25378..a66534c78c4d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1071,7 +1071,8 @@ struct vma_iterator { .mas = { \ .tree = &(__mm)->mm_mt, \ .index = __addr, \ - .node = MAS_START, \ + .node = NULL, \ + .status = ma_start, \ }, \ } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index f0d2aea91351..187a9796188e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -249,40 +249,40 @@ static __always_inline bool mt_is_reserved(const void *entry) xa_is_internal(entry); } -static inline void mas_set_err(struct ma_state *mas, long err) +static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); + mas->status = ma_error; } -static inline bool 
mas_is_ptr(const struct ma_state *mas) +static __always_inline bool mas_is_ptr(const struct ma_state *mas) { - return mas->node == MAS_ROOT; + return mas->status == ma_root; } -static inline bool mas_is_start(const struct ma_state *mas) +static __always_inline bool mas_is_start(const struct ma_state *mas) { - return mas->node == MAS_START; + return mas->status == ma_start; } -bool mas_is_err(struct ma_state *mas) +static __always_inline bool mas_is_none(const struct ma_state *mas) { - return xa_is_err(mas->node); + return mas->status == ma_none; } -static __always_inline bool mas_is_overflow(struct ma_state *mas) +static __always_inline bool mas_is_paused(const struct ma_state *mas) { - if (unlikely(mas->node == MAS_OVERFLOW)) - return true; - - return false; + return mas->status == ma_pause; } -static __always_inline bool mas_is_underflow(struct ma_state *mas) +static __always_inline bool mas_is_overflow(struct ma_state *mas) { - if (unlikely(mas->node == MAS_UNDERFLOW)) - return true; + return mas->status == ma_overflow; +} - return false; +static inline bool mas_is_underflow(struct ma_state *mas) +{ + return mas->status == ma_underflow; } static inline bool mas_searchable(struct ma_state *mas) @@ -1274,6 +1274,7 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) if (mas->mas_flags & MA_STATE_PREALLOC) { if (allocated) return; + BUG_ON(!allocated); WARN_ON(!allocated); } @@ -1379,14 +1380,14 @@ static void mas_node_count(struct ma_state *mas, int count) * mas_start() - Sets up maple state for operations. * @mas: The maple state. * - * If mas->node == MAS_START, then set the min, max and depth to + * If mas->status == ma_start, then set the min, max and depth to * defaults. * * Return: - * - If mas->node is an error or not MAS_START, return NULL. - * - If it's an empty tree: NULL & mas->node == MAS_NONE - * - If it's a single entry: The entry & mas->node == MAS_ROOT - * - If it's a tree: NULL & mas->node == safe root node. 
+ * - If mas->node is an error or not ma_start, return NULL. + * - If it's an empty tree: NULL & mas->status == ma_none + * - If it's a single entry: The entry & mas->status == ma_root + * - If it's a tree: NULL & mas->node == safe root node. */ static inline struct maple_enode *mas_start(struct ma_state *mas) { @@ -1402,6 +1403,7 @@ retry: /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; + mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) @@ -1412,13 +1414,14 @@ retry: /* empty tree */ if (unlikely(!root)) { - mas->node = MAS_NONE; + mas->node = NULL; + mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ - mas->node = MAS_ROOT; + mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ @@ -2225,19 +2228,21 @@ static inline bool mas_next_sibling(struct ma_state *mas) } /* - * mte_node_or_node() - Return the encoded node or MAS_NONE. + * mas_node_or_none() - Set the enode and state. * @enode: The encoded maple node. * - * Shorthand to avoid setting %NULLs in the tree or maple_subtree_state. - * - * Return: @enode or MAS_NONE + * Set the node to the enode and the status. */ -static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode) +static inline void mas_node_or_none(struct ma_state *mas, + struct maple_enode *enode) { - if (enode) - return enode; - - return ma_enode_ptr(MAS_NONE); + if (enode) { + mas->node = enode; + mas->status = ma_active; + } else { + mas->node = NULL; + mas->status = ma_none; + } } /* @@ -2557,13 +2562,15 @@ static inline void mast_set_split_parents(struct maple_subtree_state *mast, * The node will either be RCU freed or pushed back on the maple state. 
*/ static inline void mas_topiary_node(struct ma_state *mas, - struct maple_enode *enode, bool in_rcu) + struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; + struct maple_enode *enode; - if (enode == MAS_NONE) + if (mas_is_none(tmp_mas)) return; + enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); if (in_rcu) @@ -2603,8 +2610,8 @@ static inline void mas_topiary_replace(struct ma_state *mas, /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; - tmp[1].node = MAS_NONE; - tmp[2].node = MAS_NONE; + tmp[1].status = ma_none; + tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { @@ -2624,7 +2631,7 @@ static inline void mas_topiary_replace(struct ma_state *mas, break; while (n < 3) - tmp_next[n++].node = MAS_NONE; + tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; @@ -2637,8 +2644,8 @@ static inline void mas_topiary_replace(struct ma_state *mas, tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; - tmp[1].node = MAS_NONE; - tmp[2].node = MAS_NONE; + tmp[1].status = ma_none; + tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; @@ -2653,7 +2660,7 @@ static inline void mas_topiary_replace(struct ma_state *mas, if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); - tmp_next[n].node = MAS_NONE; + tmp_next[n].status = ma_none; } else { n++; } @@ -2664,16 +2671,16 @@ static inline void mas_topiary_replace(struct ma_state *mas, break; while (n < 3) - tmp_next[n++].node = MAS_NONE; + tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { - mas_topiary_node(mas, tmp[i].node, in_rcu); + mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) - mas_topiary_node(mas, tmp[i].node, in_rcu); + mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } @@ -2712,9 +2719,9 @@ 
static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, { bool new_lmax = true; - mast->l->node = mte_node_or_none(left); - mast->m->node = mte_node_or_none(middle); - mast->r->node = mte_node_or_none(right); + mas_node_or_none(mast->l, left); + mas_node_or_none(mast->m, middle); + mas_node_or_none(mast->r, right); mast->l->min = mast->orig_l->min; if (split == mast->bn->b_end) { @@ -2894,7 +2901,7 @@ static int mas_spanning_rebalance(struct ma_state *mas, mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; - l_mas.node = r_mas.node = m_mas.node = MAS_NONE; + l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && @@ -3421,7 +3428,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) /* Try to push left. */ if (mas_push_data(mas, height, &mast, true)) break; - /* Try to push right. */ if (mas_push_data(mas, height, &mast, false)) break; @@ -3537,6 +3543,7 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); + mas->status = ma_active; if (mas->index) { if (contents) { @@ -3569,7 +3576,7 @@ static inline void mas_store_root(struct ma_state *mas, void *entry) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); - mas->node = MAS_START; + mas->status = ma_start; } } @@ -3801,7 +3808,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) mas->depth = 0; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, entry); - mas->node = MAS_START; + mas->status = ma_start; goto done; } @@ -3814,6 +3821,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); + mas->status = ma_active; 
rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mas->depth = 1; @@ -4367,11 +4375,13 @@ static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, /* * mas_prev_node() - Find the prev non-null entry at the same level in the - * tree. The prev value will be mas->node[mas->offset] or MAS_NONE. + * tree. The prev value will be mas->node[mas->offset] or the status will be + * ma_underflow. * @mas: The maple state * @min: The lower limit to search * - * The prev node value will be mas->node[mas->offset] or MAS_NONE. + * The prev node value will be mas->node[mas->offset] or the status will be + * ma_underflow. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) @@ -4441,7 +4451,7 @@ no_entry: if (unlikely(ma_dead_node(node))) return 1; - mas->node = MAS_NONE; + mas->status = ma_underflow; return 0; } @@ -4455,8 +4465,7 @@ no_entry: * * Return: The entry in the previous slot which is possibly NULL */ -static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty, - bool set_underflow) +static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; @@ -4489,13 +4498,16 @@ again: mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { + if (mas->index <= min) + goto underflow; + if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } - if (mas_is_none(mas)) - goto underflow; + if (WARN_ON_ONCE(mas_is_underflow(mas))) + return NULL; mas->last = mas->max; node = mas_mn(mas); @@ -4509,12 +4521,15 @@ again: if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; + if (likely(entry)) return entry; if (!empty) { - if (mas->index <= min) - goto underflow; + if (mas->index <= min) { + mas->status = ma_underflow; + return NULL; + } goto again; } @@ -4522,8 +4537,7 @@ again: return entry; underflow: - if (set_underflow) - mas->node = MAS_UNDERFLOW; + mas->status = ma_underflow; 
return NULL; } @@ -4532,7 +4546,8 @@ underflow: * @mas: The maple state * @max: The maximum pivot value to check. * - * The next value will be mas->node[mas->offset] or MAS_NONE. + * The next value will be mas->node[mas->offset] or the status will have + * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, @@ -4548,13 +4563,13 @@ static int mas_next_node(struct ma_state *mas, struct maple_node *node, void __rcu **slots; if (mas->max >= max) - goto no_entry; + goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) - goto no_entry; + goto overflow; /* Walk up. */ if (unlikely(mas_ascend(mas))) @@ -4605,11 +4620,11 @@ static int mas_next_node(struct ma_state *mas, struct maple_node *node, mas->min = min; return 0; -no_entry: +overflow: if (unlikely(ma_dead_node(node))) return 1; - mas->node = MAS_NONE; + mas->status = ma_overflow; return 0; } @@ -4624,8 +4639,7 @@ no_entry: * * Return: The entry in the next slot which is possibly NULL */ -static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty, - bool set_overflow) +static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; @@ -4646,13 +4660,15 @@ retry: if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else - goto overflow; + pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; - if (pivot >= max) - goto overflow; + if (pivot >= max) { /* Was at the limit, next will extend beyond */ + mas->status = ma_overflow; + return NULL; + } } if (likely(mas->offset < mas->end)) { @@ -4664,16 +4680,18 @@ again: else mas->last = mas->max; } else { + if (mas->last >= max) { + mas->status = ma_overflow; + return NULL; + } + if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } - if (WARN_ON_ONCE(mas_is_none(mas))) { - mas->node = MAS_OVERFLOW; + if 
(WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; - goto overflow; - } mas->offset = 0; mas->index = mas->min; @@ -4691,20 +4709,18 @@ again: if (entry) return entry; + if (!empty) { - if (mas->last >= max) - goto overflow; + if (mas->last >= max) { + mas->status = ma_overflow; + return NULL; + } mas->index = mas->last + 1; goto again; } return entry; - -overflow: - if (set_overflow) - mas->node = MAS_OVERFLOW; - return NULL; } /* @@ -4723,11 +4739,11 @@ overflow: static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) { if (mas->last >= limit) { - mas->node = MAS_OVERFLOW; + mas->status = ma_overflow; return NULL; } - return mas_next_slot(mas, limit, false, true); + return mas_next_slot(mas, limit, false); } /* @@ -4895,7 +4911,7 @@ done: * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If - * mas->node is MAS_NONE, reset to MAS_START. + * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. 
*/ @@ -4904,7 +4920,7 @@ void *mas_walk(struct ma_state *mas) void *entry; if (!mas_is_active(mas) || !mas_is_start(mas)) - mas->node = MAS_START; + mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { @@ -4920,7 +4936,7 @@ retry: mas->index = 1; mas->last = ULONG_MAX; - mas->node = MAS_NONE; + mas->status = ma_none; return NULL; } @@ -5672,27 +5688,40 @@ static bool mas_next_setup(struct ma_state *mas, unsigned long max, bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { - mas->node = MAS_OVERFLOW; + mas->status = ma_overflow; return true; } - if (mas_is_active(mas)) + switch (mas->status) { + case ma_active: return false; - - if (mas_is_none(mas) || mas_is_paused(mas)) { - mas->node = MAS_START; - } else if (mas_is_overflow(mas)) { + case ma_none: + fallthrough; + case ma_pause: + mas->status = ma_start; + fallthrough; + case ma_start: + mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ + break; + case ma_overflow: /* Overflowed before, but the max changed */ - mas->node = MAS_START; - } else if (mas_is_underflow(mas)) { - mas->node = MAS_START; + mas->status = ma_active; + break; + case ma_underflow: + /* The user expects the mas to be one before where it is */ + mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; + break; + case ma_root: + break; + case ma_error: + return true; } - if (mas_is_start(mas)) - *entry = mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ + if (likely(mas_is_active(mas))) /* Fast path */ + return false; if (mas_is_ptr(mas)) { *entry = NULL; @@ -5702,7 +5731,7 @@ static bool mas_next_setup(struct ma_state *mas, unsigned long max, } mas->index = 1; mas->last = ULONG_MAX; - mas->node = MAS_NONE; + mas->status = ma_none; return true; } @@ -5731,7 +5760,7 @@ void *mas_next(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, false, true); + return 
mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); @@ -5754,7 +5783,7 @@ void *mas_next_range(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, true, true); + return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); @@ -5785,33 +5814,45 @@ EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { - mas->node = MAS_UNDERFLOW; + mas->status = ma_underflow; return true; } - if (mas_is_active(mas)) + switch (mas->status) { + case ma_active: return false; - - if (mas_is_overflow(mas)) { - mas->node = MAS_START; + case ma_start: + break; + case ma_none: + fallthrough; + case ma_pause: + mas->status = ma_start; + break; + case ma_underflow: + /* underflowed before but the min changed */ + mas->status = ma_active; + break; + case ma_overflow: + /* User expects mas to be one after where it is */ + mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; - } - - if (mas_is_none(mas) || mas_is_paused(mas)) { - mas->node = MAS_START; - } else if (mas_is_underflow(mas)) { - /* underflowed before but the min changed */ - mas->node = MAS_START; + break; + case ma_root: + break; + case ma_error: + return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { - if (!mas->index) - goto none; + if (!mas->index) { + mas->status = ma_none; + return true; + } mas->index = mas->last = 0; *entry = mas_root(mas); return true; @@ -5821,7 +5862,7 @@ static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry if (mas->index) { /* Walked to out-of-range pointer? 
*/ mas->index = mas->last = 0; - mas->node = MAS_ROOT; + mas->status = ma_root; *entry = mas_root(mas); return true; } @@ -5829,10 +5870,6 @@ static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry } return false; - -none: - mas->node = MAS_NONE; - return true; } /** @@ -5841,7 +5878,7 @@ none: * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. - * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not + * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. @@ -5853,7 +5890,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min) if (mas_prev_setup(mas, min, &entry)) return entry; - return mas_prev_slot(mas, min, false, true); + return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); @@ -5864,7 +5901,7 @@ EXPORT_SYMBOL_GPL(mas_prev); * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. - * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not + * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. 
@@ -5876,7 +5913,7 @@ void *mas_prev_range(struct ma_state *mas, unsigned long min) if (mas_prev_setup(mas, min, &entry)) return entry; - return mas_prev_slot(mas, min, true, true); + return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); @@ -5919,7 +5956,8 @@ EXPORT_SYMBOL_GPL(mt_prev); */ void mas_pause(struct ma_state *mas) { - mas->node = MAS_PAUSE; + mas->status = ma_pause; + mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); @@ -5933,32 +5971,52 @@ EXPORT_SYMBOL_GPL(mas_pause); */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { - if (mas_is_active(mas)) { + switch (mas->status) { + case ma_active: if (mas->last < max) return false; - return true; - } - - if (mas_is_paused(mas)) { + case ma_start: + break; + case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; - mas->node = MAS_START; - } else if (mas_is_none(mas)) { + mas->status = ma_start; + break; + case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; - mas->node = MAS_START; - } else if (mas_is_overflow(mas) || mas_is_underflow(mas)) { - if (mas->index > max) { - mas->node = MAS_OVERFLOW; + mas->status = ma_start; + break; + case ma_underflow: + /* mas is pointing at entry before unable to go lower */ + if (unlikely(mas->index >= max)) { + mas->status = ma_overflow; return true; } - mas->node = MAS_START; + mas->status = ma_active; + *entry = mas_walk(mas); + if (*entry) + return true; + break; + case ma_overflow: + if (unlikely(mas->last >= max)) + return true; + + mas->status = ma_active; + *entry = mas_walk(mas); + if (*entry) + return true; + break; + case ma_root: + break; + case ma_error: + return true; } if (mas_is_start(mas)) { @@ -5985,7 +6043,7 @@ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long m return false; ptr_out_of_range: - mas->node = MAS_NONE; + mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; @@ 
-5999,7 +6057,7 @@ ptr_out_of_range: * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ @@ -6011,7 +6069,10 @@ void *mas_find(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, false, false); + entry = mas_next_slot(mas, max, false); + /* Ignore overflow */ + mas->status = ma_active; + return entry; } EXPORT_SYMBOL_GPL(mas_find); @@ -6023,7 +6084,7 @@ EXPORT_SYMBOL_GPL(mas_find); * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ @@ -6035,7 +6096,7 @@ void *mas_find_range(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, true, false); + return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); @@ -6050,33 +6111,45 @@ EXPORT_SYMBOL_GPL(mas_find_range); static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { - if (mas_is_active(mas)) { - if (mas->index > min) - return false; - - return true; - } - if (mas_is_paused(mas)) { + switch (mas->status) { + case ma_active: + goto active; + case ma_start: + break; + case ma_pause: if (unlikely(mas->index <= min)) { - mas->node = MAS_NONE; + mas->status = ma_underflow; return true; } - mas->node = MAS_START; mas->last = --mas->index; - } else if (mas_is_none(mas)) { + mas->status = ma_start; + break; + case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; - mas->node = MAS_START; - } else if (mas_is_underflow(mas) || mas_is_overflow(mas)) { - if (mas->last <= min) { - mas->node = MAS_UNDERFLOW; + mas->status = ma_start; + break; + case ma_overflow: 
/* user expects the mas to be one after where it is */ + if (unlikely(mas->index <= min)) { + mas->status = ma_underflow; return true; } - mas->node = MAS_START; + mas->status = ma_active; + break; + case ma_underflow: /* user expects the mas to be one before where it is */ + if (unlikely(mas->index <= min)) + return true; + + mas->status = ma_active; + break; + case ma_root: + break; + case ma_error: + return true; } if (mas_is_start(mas)) { @@ -6099,19 +6172,20 @@ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, * previous location is 0. */ mas->last = mas->index = 0; - mas->node = MAS_ROOT; + mas->status = ma_root; *entry = mas_root(mas); return true; } } +active: if (mas->index < min) return true; return false; none: - mas->node = MAS_NONE; + mas->status = ma_none; return true; } @@ -6124,7 +6198,7 @@ none: * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ @@ -6136,7 +6210,7 @@ void *mas_find_rev(struct ma_state *mas, unsigned long min) return entry; /* Retries on dead nodes handled by mas_prev_slot */ - return mas_prev_slot(mas, min, false, false); + return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); @@ -6150,7 +6224,7 @@ EXPORT_SYMBOL_GPL(mas_find_rev); * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. 
*/ @@ -6162,7 +6236,7 @@ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) return entry; /* Retries on dead nodes handled by mas_prev_slot */ - return mas_prev_slot(mas, min, true, false); + return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); @@ -6183,7 +6257,7 @@ void *mas_erase(struct ma_state *mas) MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) - mas->node = MAS_START; + mas->status = ma_start; /* Retry unnecessary when holding the write lock. */ entry = mas_state_walk(mas); @@ -6228,7 +6302,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp) if (!mas_allocated(mas)) return false; - mas->node = MAS_START; + mas->status = ma_start; return true; } @@ -6627,7 +6701,7 @@ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, node = mt_alloc_one(gfp); if (!node) { - new_mas->node = MAS_NONE; + new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } @@ -6971,11 +7045,11 @@ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { - struct maple_enode *p = MAS_NONE, *mn = mas->node; + struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); - if (!mas_is_none(mas)) + if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) @@ -6988,7 +7062,7 @@ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); - } while (!mas_is_none(mas)); + } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; @@ -7443,7 +7517,7 @@ static void mt_validate_nulls(struct maple_tree *mt) MA_STATE(mas, mt, 0, 0); mas_start(&mas); - if (mas_is_none(&mas) || (mas.node == MAS_ROOT)) + if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) @@ -7460,7 +7534,7 @@ static void mt_validate_nulls(struct maple_tree *mt) last = entry; if (offset == 
mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); - if (mas_is_none(&mas)) + if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), @@ -7469,7 +7543,7 @@ static void mt_validate_nulls(struct maple_tree *mt) offset++; } - } while (!mas_is_none(&mas)); + } while (!mas_is_overflow(&mas)); } /* @@ -7490,7 +7564,7 @@ void mt_validate(struct maple_tree *mt) while (!mte_is_leaf(mas.node)) mas_descend(&mas); - while (!mas_is_none(&mas)) { + while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && @@ -7515,16 +7589,35 @@ EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node); - if (mas_is_none(mas)) - pr_err("(MAS_NONE) "); - else if (mas_is_ptr(mas)) - pr_err("(MAS_ROOT) "); - else if (mas_is_start(mas)) - pr_err("(MAS_START) "); - else if (mas_is_paused(mas)) - pr_err("(MAS_PAUSED) "); - - pr_err("[%u] index=%lx last=%lx\n", mas->offset, mas->index, mas->last); + switch (mas->status) { + case ma_active: + pr_err("(ma_active)"); + break; + case ma_none: + pr_err("(ma_none)"); + break; + case ma_root: + pr_err("(ma_root)"); + break; + case ma_start: + pr_err("(ma_start) "); + break; + case ma_pause: + pr_err("(ma_pause) "); + break; + case ma_overflow: + pr_err("(ma_overflow) "); + break; + case ma_underflow: + pr_err("(ma_underflow) "); + break; + case ma_error: + pr_err("(ma_error) "); + break; + } + + pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, + mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 3e4597fb49d3..e7a5d688c9e0 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -54,6 +54,11 @@ atomic_t maple_tree_tests_passed; #else 
#define cond_resched() do {} while (0) #endif + +#define mas_is_none(x) ((x)->status == ma_none) +#define mas_is_overflow(x) ((x)->status == ma_overflow) +#define mas_is_underflow(x) ((x)->status == ma_underflow) + static int __init mtree_insert_index(struct maple_tree *mt, unsigned long index, gfp_t gfp) { @@ -582,7 +587,7 @@ static noinline void __init check_find(struct maple_tree *mt) MT_BUG_ON(mt, last != mas.last); - mas.node = MAS_NONE; + mas.status = ma_none; mas.index = ULONG_MAX; mas.last = ULONG_MAX; entry2 = mas_prev(&mas, 0); @@ -2178,7 +2183,7 @@ static noinline void __init next_prev_test(struct maple_tree *mt) MT_BUG_ON(mt, val != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 5); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); mas.index = 0; mas.last = 5; @@ -3042,10 +3047,6 @@ static noinline void __init check_empty_area_fill(struct maple_tree *mt) * DNE active active range of NULL */ -#define mas_active(x) (((x).node != MAS_ROOT) && \ - ((x).node != MAS_START) && \ - ((x).node != MAS_PAUSE) && \ - ((x).node != MAS_NONE)) static noinline void __init check_state_handling(struct maple_tree *mt) { MA_STATE(mas, mt, 0, 0); @@ -3060,7 +3061,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) /* prev: Start -> underflow*/ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, mas.status != ma_underflow); /* prev: Start -> root */ mas_set(&mas, 10); @@ -3068,7 +3069,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* prev: pause -> root */ mas_set(&mas, 10); @@ -3077,7 +3078,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, 
mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* next: start -> none */ mas_set(&mas, 0); @@ -3085,7 +3086,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* next: start -> none*/ mas_set(&mas, 10); @@ -3093,7 +3094,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find: start -> root */ mas_set(&mas, 0); @@ -3101,21 +3102,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* find: root -> none */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find: none -> none */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find: start -> none */ mas_set(&mas, 10); @@ -3123,14 +3124,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find_rev: none -> root */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, 
mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* find_rev: start -> root */ mas_set(&mas, 0); @@ -3138,21 +3139,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* find_rev: root -> none */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find_rev: none -> none */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find_rev: start -> root */ mas_set(&mas, 10); @@ -3160,7 +3161,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: start -> none */ mas_set(&mas, 10); @@ -3168,7 +3169,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: pause -> none*/ mas_set(&mas, 10); @@ -3177,7 +3178,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: none -> none */ mas.index = mas.last = 10; @@ -3185,14 +3186,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); 
MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: none -> none */ entry = mas_walk(&mas); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: start -> root */ mas_set(&mas, 0); @@ -3200,7 +3201,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: pause -> root */ mas_set(&mas, 0); @@ -3209,22 +3210,22 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: none -> root */ - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_walk(&mas); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: root -> root */ entry = mas_walk(&mas); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: root -> none */ mas_set(&mas, 10); @@ -3232,7 +3233,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: none -> root */ mas.index = mas.last = 0; @@ -3240,7 +3241,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - 
MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); mas_unlock(&mas); @@ -3258,7 +3259,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* next: pause ->active */ mas_set(&mas, 0); @@ -3267,126 +3268,132 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* next: none ->active */ mas.index = mas.last = 0; mas.offset = 0; - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* next:active ->active */ - entry = mas_next(&mas, ULONG_MAX); + /* next:active ->active (spanning limit) */ + entry = mas_next(&mas, 0x2100); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* next:active -> active beyond data */ + /* next:active -> overflow (limit reached) beyond data */ entry = mas_next(&mas, 0x2999); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x2501); MT_BUG_ON(mt, mas.last != 0x2fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_overflow(&mas)); - /* Continue after last range ends after max */ + /* next:overflow -> active (limit changed) */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* next:active -> active continued */ + /* next:active 
-> overflow (limit reached) */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x3501); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, !mas_active(mas)); - - /* next:active -> overflow */ - entry = mas_next(&mas, ULONG_MAX); - MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.index != 0x3501); - MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_OVERFLOW); + MT_BUG_ON(mt, !mas_is_overflow(&mas)); /* next:overflow -> overflow */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x3501); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_OVERFLOW); + MT_BUG_ON(mt, !mas_is_overflow(&mas)); /* prev:overflow -> active */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* next: none -> active, skip value at location */ mas_set(&mas, 0); entry = mas_next(&mas, ULONG_MAX); - mas.node = MAS_NONE; + mas.status = ma_none; mas.offset = 0; entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* prev:active ->active */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* prev:active -> active spanning end range */ + /* prev:active -> underflow (span limit) */ + mas_next(&mas, ULONG_MAX); + entry = mas_prev(&mas, 0x1200); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* spanning limit */ + entry = mas_prev(&mas, 0x1200); /* underflow */ + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 
0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); + + /* prev:underflow -> underflow (lower limit) spanning end range */ entry = mas_prev(&mas, 0x0100); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); - /* prev:active -> underflow */ + /* prev:underflow -> underflow */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* prev:underflow -> underflow */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* next:underflow -> active */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* prev:first value -> underflow */ entry = mas_prev(&mas, 0x1000); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* find:underflow -> first value */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* prev: pause ->active */ mas_set(&mas, 0x3600); @@ -3397,21 +3404,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* prev:active -> active spanning min */ + /* 
prev:active -> underflow spanning min */ entry = mas_prev(&mas, 0x1600); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* prev: active ->active, continue */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find: start ->active */ mas_set(&mas, 0); @@ -3419,7 +3426,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find: pause ->active */ mas_set(&mas, 0); @@ -3428,7 +3435,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find: start ->active on value */; mas_set(&mas, 1200); @@ -3436,14 +3443,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find:active ->active */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find:active -> active (NULL)*/ @@ -3451,35 +3458,35 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x2501); MT_BUG_ON(mt, mas.last != 0x2FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MAS_BUG_ON(&mas, !mas_is_active(&mas)); /* find: 
overflow ->active */ entry = mas_find(&mas, 0x5000); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find:active -> active (NULL) end*/ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x3501); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, !mas_active(mas)); + MAS_BUG_ON(&mas, !mas_is_active(&mas)); /* find_rev: active (END) ->active */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find_rev:active ->active */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find_rev: pause ->active */ mas_pause(&mas); @@ -3487,14 +3494,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* find_rev:active -> active */ + /* find_rev:active -> underflow */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* find_rev: start ->active */ mas_set(&mas, 0x1200); @@ -3502,7 +3509,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk start ->active */ mas_set(&mas, 0x1200); @@ -3510,7 +3517,7 @@ static noinline void __init 
check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk start ->active */ mas_set(&mas, 0x1600); @@ -3518,7 +3525,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk pause ->active */ mas_set(&mas, 0x1200); @@ -3527,7 +3534,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk pause -> active */ mas_set(&mas, 0x1600); @@ -3536,25 +3543,25 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk none -> active */ mas_set(&mas, 0x1200); - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_walk(&mas); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk none -> active */ mas_set(&mas, 0x1600); - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_walk(&mas); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk active -> active */ mas.index = 0x1200; @@ -3564,7 +3571,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); 
MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk active -> active */ mas.index = 0x1600; @@ -3573,7 +3580,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); mas_unlock(&mas); } diff --git a/mm/internal.h b/mm/internal.h index 0005b8adbd5c..8450562744cf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1163,13 +1163,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi, { #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START && + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.index > vma->vm_start)) { pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", vmi->mas.index, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } - if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START && + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.last < vma->vm_start)) { pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, @@ -1177,7 +1177,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi, } #endif - if (vmi->mas.node != MAS_START && + if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); @@ -1188,7 +1188,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi, static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { - if (vmi->mas.node != MAS_START && + if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 
7095fb0ec026..857c439e6bbc 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -118,6 +118,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.alloc == NULL); MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); mas_push_node(&mas, mn); + mas_reset(&mas); mas_nomem(&mas, GFP_KERNEL); /* free */ mtree_unlock(mt); @@ -141,7 +142,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); - mas.node = MAS_START; + mas.status = ma_start; mas_nomem(&mas, GFP_KERNEL); /* Allocate 3 nodes, will fail. */ mas_node_count(&mas, 3); @@ -158,6 +159,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) /* Ensure we counted 3. */ MT_BUG_ON(mt, mas_allocated(&mas) != 3); /* Free. */ + mas_reset(&mas); mas_nomem(&mas, GFP_KERNEL); /* Set allocation request to 1. */ @@ -272,6 +274,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) ma_free_rcu(mn); MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); } + mas_reset(&mas); MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); } @@ -294,6 +297,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) smn = smn->slot[0]; /* next. */ } MT_BUG_ON(mt, mas_allocated(&mas) != total); + mas_reset(&mas); mas_nomem(&mas, GFP_KERNEL); /* Free. 
*/ MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -441,7 +445,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mas.node = MA_ERROR(-ENOMEM); mas_node_count(&mas, 10); /* Request */ mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.node = MAS_START; + mas.status = ma_start; MT_BUG_ON(mt, mas_allocated(&mas) != 10); mas_destroy(&mas); @@ -452,7 +456,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mas.node = MA_ERROR(-ENOMEM); mas_node_count(&mas, 10 + MAPLE_ALLOC_SLOTS - 1); /* Request */ mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.node = MAS_START; + mas.status = ma_start; MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1); mas_destroy(&mas); @@ -941,7 +945,7 @@ retry: ret = mas_descend_walk(mas, range_min, range_max); if (unlikely(mte_dead_node(mas->node))) { - mas->node = MAS_START; + mas->status = ma_start; goto retry; } @@ -961,10 +965,10 @@ static inline void *mas_range_load(struct ma_state *mas, unsigned long index = mas->index; if (mas_is_none(mas) || mas_is_paused(mas)) - mas->node = MAS_START; + mas->status = ma_start; retry: if (mas_tree_walk(mas, range_min, range_max)) - if (unlikely(mas->node == MAS_ROOT)) + if (unlikely(mas->status == ma_root)) return mas_root(mas); if (likely(mas->offset != MAPLE_NODE_SLOTS)) @@ -35337,7 +35341,7 @@ static void mas_dfs_preorder(struct ma_state *mas) unsigned char end, slot = 0; unsigned long *pivots; - if (mas->node == MAS_START) { + if (mas->status == ma_start) { mas_start(mas); return; } @@ -35374,7 +35378,7 @@ walk_up: return; done: - mas->node = MAS_NONE; + mas->status = ma_none; } @@ -35833,7 +35837,7 @@ static noinline void __init check_nomem(struct maple_tree *mt) mas_store(&ms, &ms); /* insert 1 -> &ms, fails. */ MT_BUG_ON(mt, ms.node != MA_ERROR(-ENOMEM)); mas_nomem(&ms, GFP_KERNEL); /* Node allocated in here. 
*/ - MT_BUG_ON(mt, ms.node != MAS_START); + MT_BUG_ON(mt, ms.status != ma_start); mtree_unlock(mt); MT_BUG_ON(mt, mtree_insert(mt, 2, mt, GFP_KERNEL) != 0); mtree_lock(mt); @@ -35952,7 +35956,7 @@ static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) if (mas_is_ptr(&mas_a) || mas_is_ptr(&mas_b)) { if (!(mas_is_ptr(&mas_a) && mas_is_ptr(&mas_b))) { - pr_err("One is MAS_ROOT and the other is not.\n"); + pr_err("One is ma_root and the other is not.\n"); return -1; } return 0; @@ -35961,7 +35965,7 @@ static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) while (!mas_is_none(&mas_a) || !mas_is_none(&mas_b)) { if (mas_is_none(&mas_a) || mas_is_none(&mas_b)) { - pr_err("One is MAS_NONE and the other is not.\n"); + pr_err("One is ma_none and the other is not.\n"); return -1; } -- cgit v1.2.3 From 0de56e38b307b0cb2ac825e8e7cb371a28daf844 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:27 -0400 Subject: maple_tree: use maple state end for write operations ma_wr_state was previously tracking the end of the node for writing. Since the implementation of the ma_state end tracking, this is duplicated work. This patch removes the maple write state tracking of the end of the node and uses the maple state end instead. Link: https://lkml.kernel.org/r/20231101171629.3612299-11-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 - lib/maple_tree.c | 46 ++++++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 4dd668f7b111..b3d63123b945 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -441,7 +441,6 @@ struct ma_wr_state { unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ - unsigned char node_end; /* mas->node end */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c7016066f12b..59500fe6988b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2158,11 +2158,11 @@ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, } slot = offset_end + 1; - if (slot > wr_mas->node_end) + if (slot > mas->end) goto b_end; /* Copy end data to the end of the node. */ - mas_mab_cp(mas, slot, wr_mas->node_end + 1, b_node, ++b_end); + mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end); b_node->b_end--; return; @@ -2253,8 +2253,8 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) wr_mas->node = mas_mn(wr_mas->mas); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); - count = wr_mas->node_end = ma_data_end(wr_mas->node, wr_mas->type, - wr_mas->pivots, mas->max); + count = mas->end = ma_data_end(wr_mas->node, wr_mas->type, + wr_mas->pivots, mas->max); offset = mas->offset; while (offset < count && mas->index > wr_mas->pivots[offset]) @@ -3904,10 +3904,10 @@ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. 
*/ - mas_store_b_node(&l_wr_mas, &b_node, l_wr_mas.node_end); + mas_store_b_node(&l_wr_mas, &b_node, l_mas.end); /* Copy r_mas into b_node. */ - if (r_mas.offset <= r_wr_mas.node_end) - mas_mab_cp(&r_mas, r_mas.offset, r_wr_mas.node_end, + if (r_mas.offset <= r_mas.end) + mas_mab_cp(&r_mas, r_mas.offset, r_mas.end, &b_node, b_node.b_end + 1); else b_node.b_end++; @@ -3949,7 +3949,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ else if (unlikely(wr_mas->r_max == ULONG_MAX)) - mas_bulk_rebalance(mas, wr_mas->node_end, wr_mas->type); + mas_bulk_rebalance(mas, mas->end, wr_mas->type); /* set up node. */ if (in_rcu) { @@ -3985,12 +3985,12 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, * this range wrote to the end of the node or it overwrote the rest of * the data */ - if (offset_end > wr_mas->node_end) + if (offset_end > mas->end) goto done; dst_offset = mas->offset + 1; /* Copy to the end of node if necessary. 
*/ - copy_size = wr_mas->node_end - offset_end + 1; + copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, sizeof(void *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, @@ -4077,10 +4077,10 @@ static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) } else { /* Check next slot(s) if we are overwriting the end */ if ((mas->last == wr_mas->end_piv) && - (wr_mas->node_end != wr_mas->offset_end) && + (mas->end != wr_mas->offset_end) && !wr_mas->slots[wr_mas->offset_end + 1]) { wr_mas->offset_end++; - if (wr_mas->offset_end == wr_mas->node_end) + if (wr_mas->offset_end == mas->end) mas->last = mas->max; else mas->last = wr_mas->pivots[wr_mas->offset_end]; @@ -4105,11 +4105,11 @@ static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { - while ((wr_mas->offset_end < wr_mas->node_end) && + while ((wr_mas->offset_end < wr_mas->mas->end) && (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) wr_mas->offset_end++; - if (wr_mas->offset_end < wr_mas->node_end) + if (wr_mas->offset_end < wr_mas->mas->end) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; @@ -4121,7 +4121,7 @@ static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; - unsigned char new_end = wr_mas->node_end + 2; + unsigned char new_end = mas->end + 2; new_end -= wr_mas->offset_end - mas->offset; if (wr_mas->r_min == mas->index) @@ -4155,10 +4155,10 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, if (mt_in_rcu(mas->tree)) return false; - if (mas->offset != wr_mas->node_end) + if (mas->offset != mas->end) return false; - end = wr_mas->node_end; + end = mas->end; if (mas->offset != end) return false; @@ -4210,7 +4210,7 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas) 
trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); - mas_commit_b_node(wr_mas, &b_node, wr_mas->node_end); + mas_commit_b_node(wr_mas, &b_node, wr_mas->mas->end); } static inline void mas_wr_modify(struct ma_wr_state *wr_mas) @@ -4238,7 +4238,7 @@ static inline void mas_wr_modify(struct ma_wr_state *wr_mas) if (mas_wr_append(wr_mas, new_end)) return; - if (new_end == wr_mas->node_end && mas_wr_slot_store(wr_mas)) + if (new_end == mas->end && mas_wr_slot_store(wr_mas)) return; if (mas_wr_node_store(wr_mas, new_end)) @@ -5052,6 +5052,7 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned char offset; unsigned long *pivots; enum maple_type mt; + struct maple_node *node; if (min > max) return -EINVAL; @@ -5082,13 +5083,14 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, if (unlikely(offset == MAPLE_NODE_SLOTS)) return -EBUSY; + node = mas_mn(mas); mt = mte_node_type(mas->node); - pivots = ma_pivots(mas_mn(mas), mt); + pivots = ma_pivots(node, mt); min = mas_safe_min(mas, pivots, offset); if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; - mas->end = mas_data_end(mas); + mas->end = ma_data_end(node, mt, pivots, mas->max); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); @@ -7596,7 +7598,7 @@ void mas_wr_dump(const struct ma_wr_state *wr_mas) pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", - wr_mas->type, wr_mas->offset_end, wr_mas->node_end, + wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, wr_mas->end_piv); } EXPORT_SYMBOL_GPL(mas_wr_dump); -- cgit v1.2.3 From 0a97c01cd20bb96359d8c9dedad92a061ed34e0b Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:18 -0800 Subject: list_lru: allow explicit memcg and NUMA node selection Patch series "workload-specific and memory pressure-driven 
zswap writeback", v8. There are currently several issues with zswap writeback: 1. There is only a single global LRU for zswap, making it impossible to perform workload-specific shrinking - an memcg under memory pressure cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u But this solution leaves a lot to be desired, as we still do not have an avenue for an memcg to free up its own memory locked up in the zswap pool. 2. We only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages). This patch series solves these issues by separating the global zswap LRU into per-memcg and per-NUMA LRUs, and performs workload-specific (i.e. memcg- and NUMA-aware) zswap writeback under memory pressure. The new shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. As a proof of concept, we ran the following synthetic benchmark: build the Linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improve the overall performance. Depending on the amount of cold data generated, we observe a 14% to 35% reduction in kernel CPU time used in the kernel builds. This patch (of 6): The interface of list_lru is based on the assumption that the list node and the data it represents belong to the same allocation, which resides on the correct node/memcg.
While this assumption is valid for existing slab objects LRU such as dentries and inodes, it is undocumented, and rather inflexible for certain potential list_lru users (such as the upcoming zswap shrinker and the THP shrinker). It has caused us a lot of issues during our development. This patch changes list_lru interface so that the caller must explicitly specify numa node and memcg when adding and removing objects. The old list_lru_add() and list_lru_del() are renamed to list_lru_add_obj() and list_lru_del_obj(), respectively. It also extends the list_lru API with a new function, list_lru_putback, which undoes a previous list_lru_isolate call. Unlike list_lru_add, it does not increment the LRU node count (as list_lru_isolate does not decrement the node count). list_lru_putback also allows for explicit memcg and NUMA node selection. Link: https://lkml.kernel.org/r/20231130194023.4102148-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- drivers/android/binder_alloc.c | 7 +++--- fs/dcache.c | 8 ++++--- fs/gfs2/quota.c | 6 ++--- fs/inode.c | 4 ++-- fs/nfs/nfs42xattr.c | 8 +++---- fs/nfsd/filecache.c | 4 ++-- fs/xfs/xfs_buf.c | 6 ++--- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_qm.c | 2 +- include/linux/list_lru.h | 54 +++++++++++++++++++++++++++++++++++++++--- mm/list_lru.c | 48 ++++++++++++++++++++++++++++++------- mm/workingset.c | 4 ++-- 12 files changed, 117 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 138f6d43d13b..f69d30c9f50f 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ 
-234,7 +234,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, if (page->page_ptr) { trace_binder_alloc_lru_start(alloc, index); - on_lru = list_lru_del(&binder_alloc_lru, &page->lru); + on_lru = list_lru_del_obj(&binder_alloc_lru, &page->lru); WARN_ON(!on_lru); trace_binder_alloc_lru_end(alloc, index); @@ -285,7 +285,7 @@ free_range: trace_binder_free_lru_start(alloc, index); - ret = list_lru_add(&binder_alloc_lru, &page->lru); + ret = list_lru_add_obj(&binder_alloc_lru, &page->lru); WARN_ON(!ret); trace_binder_free_lru_end(alloc, index); @@ -848,7 +848,7 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) if (!alloc->pages[i].page_ptr) continue; - on_lru = list_lru_del(&binder_alloc_lru, + on_lru = list_lru_del_obj(&binder_alloc_lru, &alloc->pages[i].lru); page_addr = alloc->buffer + i * PAGE_SIZE; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, @@ -1287,4 +1287,3 @@ int binder_alloc_copy_from_buffer(struct binder_alloc *alloc, return binder_alloc_do_buffer_copy(alloc, false, buffer, buffer_offset, dest, bytes); } - diff --git a/fs/dcache.c b/fs/dcache.c index c82ae731df9a..2ba37643b9c5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -428,7 +428,8 @@ static void d_lru_add(struct dentry *dentry) this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); - WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); + WARN_ON_ONCE(!list_lru_add_obj( + &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) @@ -438,7 +439,8 @@ static void d_lru_del(struct dentry *dentry) this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); - WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); + WARN_ON_ONCE(!list_lru_del_obj( + &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) @@ -1240,7 +1242,7 @@ static enum lru_status dentry_lru_isolate(struct 
list_head *item, * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like - * list_lru_add and list_lru_del. List movement in this file + * list_lru_add_obj and list_lru_del_obj. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 95dae7838b4e..b57f8c7b35be 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -271,7 +271,7 @@ static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, if (qd->qd_sbd != sdp) continue; if (lockref_get_not_dead(&qd->qd_lockref)) { - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); return qd; } } @@ -344,7 +344,7 @@ static void qd_put(struct gfs2_quota_data *qd) } qd->qd_lockref.count = 0; - list_lru_add(&gfs2_qd_lru, &qd->qd_lru); + list_lru_add_obj(&gfs2_qd_lru, &qd->qd_lru); spin_unlock(&qd->qd_lockref.lock); } @@ -1517,7 +1517,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) lockref_mark_dead(&qd->qd_lockref); spin_unlock(&qd->qd_lockref.lock); - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); list_add(&qd->qd_lru, &dispose); } spin_unlock(&qd_lock); diff --git a/fs/inode.c b/fs/inode.c index f238d987dec9..ef2034a985e0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -464,7 +464,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate) if (!mapping_shrinkable(&inode->i_data)) return; - if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) + if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) inode->i_state |= I_REFERENCED; @@ -482,7 +482,7 @@ void inode_add_lru(struct inode *inode) static void inode_lru_list_del(struct inode *inode) { - if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) + if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) 
this_cpu_dec(nr_unused); } diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 2ad66a8922f4..49aaf28a6950 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -132,7 +132,7 @@ nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry) lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - return list_lru_add(lru, &entry->lru); + return list_lru_add_obj(lru, &entry->lru); } static bool @@ -143,7 +143,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; - return list_lru_del(lru, &entry->lru); + return list_lru_del_obj(lru, &entry->lru); } /* @@ -349,7 +349,7 @@ nfs4_xattr_cache_unlink(struct inode *inode) oldcache = nfsi->xattr_cache; if (oldcache != NULL) { - list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru); + list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru); oldcache->inode = NULL; } nfsi->xattr_cache = NULL; @@ -474,7 +474,7 @@ nfs4_xattr_get_cache(struct inode *inode, int add) kref_get(&cache->ref); nfsi->xattr_cache = cache; cache->inode = inode; - list_lru_add(&nfs4_xattr_cache_lru, &cache->lru); + list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru); } spin_unlock(&inode->i_lock); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ef063f93fde9..6c2decfdeb4b 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -322,7 +322,7 @@ nfsd_file_check_writeback(struct nfsd_file *nf) static bool nfsd_file_lru_add(struct nfsd_file *nf) { set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) { + if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) { trace_nfsd_file_lru_add(nf); return true; } @@ -331,7 +331,7 @@ static bool nfsd_file_lru_add(struct nfsd_file *nf) static bool nfsd_file_lru_remove(struct nfsd_file *nf) { - if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) { + if (list_lru_del_obj(&nfsd_file_lru, &nf->nf_lru)) { 
trace_nfsd_file_lru_del(nf); return true; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 545c7991b9b5..669332849680 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -169,7 +169,7 @@ xfs_buf_stale( atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && - (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) atomic_dec(&bp->b_hold); ASSERT(atomic_read(&bp->b_hold) >= 1); @@ -1047,7 +1047,7 @@ xfs_buf_rele( * buffer for the LRU and clear the (now stale) dispose list * state flag */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) { bp->b_state &= ~XFS_BSTATE_DISPOSE; atomic_inc(&bp->b_hold); } @@ -1060,7 +1060,7 @@ xfs_buf_rele( * was on was the disposal list */ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); } else { ASSERT(list_empty(&bp->b_lru)); } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a013b87ab8d5..61a45a86ffe8 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1065,7 +1065,7 @@ xfs_qm_dqput( struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; trace_xfs_dqput_free(dqp); - if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } xfs_dqunlock(dqp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 94a7932ac570..67d0a8564ff3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -171,7 +171,7 @@ xfs_qm_dqpurge( * hits zero, so it really should be on the freelist here. 
*/ ASSERT(!list_empty(&dqp->q_lru)); - list_lru_del(&qi->qi_lru, &dqp->q_lru); + list_lru_del_obj(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index db86ad78d428..7675a48a0701 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -75,6 +75,8 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * list_lru_add: add an element to the lru list's tail * @lru: the lru pointer * @item: the item to be added. + * @nid: the node id of the sublist to add the item to. + * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns doing * nothing. Therefore the caller does not need to keep state about whether or @@ -87,12 +89,28 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * * Return: true if the list was updated, false otherwise */ -bool list_lru_add(struct list_lru *lru, struct list_head *item); +bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); /** - * list_lru_del: delete an element to the lru list + * list_lru_add_obj: add an element to the lru list's tail + * @lru: the lru pointer + * @item: the item to be added. + * + * This function is similar to list_lru_add(), but the NUMA node and the + * memcg of the sublist is determined by @item list_head. This assumption is + * valid for slab objects LRU such as dentries, inodes, etc. + * + * Return value: true if the list was updated, false otherwise + */ +bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); + +/** + * list_lru_del: delete an element from the lru list * @lru: the lru pointer * @item: the item to be deleted. + * @nid: the node id of the sublist to delete the item from. + * @memcg: the cgroup of the sublist to delete the item from. 
* * This function works analogously as list_lru_add() in terms of list * manipulation. The comments about an element already pertaining to @@ -100,7 +118,21 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item); * * Return: true if the list was updated, false otherwise */ -bool list_lru_del(struct list_lru *lru, struct list_head *item); +bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); + +/** + * list_lru_del_obj: delete an element from the lru list + * @lru: the lru pointer + * @item: the item to be deleted. + * + * This function is similar to list_lru_del(), but the NUMA node and the + * memcg of the sublist is determined by @item list_head. This assumption is + * valid for slab objects LRU such as dentries, inodes, etc. + * + * Return value: true if the list was updated, false otherwise. + */ +bool list_lru_del_obj(struct list_lru *lru, struct list_head *item); /** * list_lru_count_one: return the number of objects currently held by @lru @@ -138,6 +170,22 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); +/** + * list_lru_putback: undo list_lru_isolate + * @lru: the lru pointer. + * @item: the item to put back. + * @nid: the node id of the sublist to put the item back to. + * @memcg: the cgroup of the sublist to put the item back to. + * + * Put back an isolated item into its original LRU. Note that unlike + * list_lru_add, this does not increment the node LRU count (as + * list_lru_isolate does not originally decrement this count). + * + * Since we might have dropped the LRU lock in between, recompute list_lru_one + * from the node's id and memcg. 
+ */ +void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); diff --git a/mm/list_lru.c b/mm/list_lru.c index a05e5bef3b40..fcca67ac26ec 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -116,21 +116,19 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, } #endif /* CONFIG_MEMCG_KMEM */ -bool list_lru_add(struct list_lru *lru, struct list_head *item) +bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) { - int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; - struct mem_cgroup *memcg; struct list_lru_one *l; spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_kmem(lru, nid, item, &memcg); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) - set_shrinker_bit(memcg, nid, - lru_shrinker_id(lru)); + set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -140,15 +138,25 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_add); -bool list_lru_del(struct list_lru *lru, struct list_head *item) +bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); + struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? 
+ mem_cgroup_from_slab_obj(item) : NULL; + + return list_lru_add(lru, item, nid, memcg); +} +EXPORT_SYMBOL_GPL(list_lru_add_obj); + +bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) +{ struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; spin_lock(&nlru->lock); if (!list_empty(item)) { - l = list_lru_from_kmem(lru, nid, item, NULL); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_del_init(item); l->nr_items--; nlru->nr_items--; @@ -160,6 +168,16 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); +bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) +{ + int nid = page_to_nid(virt_to_page(item)); + struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? + mem_cgroup_from_slab_obj(item) : NULL; + + return list_lru_del(lru, item, nid, memcg); +} +EXPORT_SYMBOL_GPL(list_lru_del_obj); + void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); @@ -175,6 +193,20 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, } EXPORT_SYMBOL_GPL(list_lru_isolate_move); +void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg) +{ + struct list_lru_one *list = + list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + + if (list_empty(item)) { + list_add_tail(item, &list->list); + if (!list->nr_items++) + set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); + } +} +EXPORT_SYMBOL_GPL(list_lru_putback); + unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { diff --git a/mm/workingset.c b/mm/workingset.c index b192e44a0e7c..c17d45c6f29b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -615,12 +615,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { - list_lru_add(&shadow_nodes, 
&node->private_list); + list_lru_add_obj(&shadow_nodes, &node->private_list); __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { - list_lru_del(&shadow_nodes, &node->private_list); + list_lru_del_obj(&shadow_nodes, &node->private_list); __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } -- cgit v1.2.3 From fdc4161ff6a5e96222e159c1f1b28d31a985130d Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:19 -0800 Subject: memcontrol: implement mem_cgroup_tryget_online() This patch implements a helper function that tries to get a reference to an memcg's css, and checks that it is online. This new function is almost exactly the same as the existing mem_cgroup_tryget(), except for the onlineness check. In the !CONFIG_MEMCG case, it always returns true, analogous to mem_cgroup_tryget(). This is useful, e.g., for the new zswap writeback scheme, where we need to select the next online memcg as a candidate for the global limit reclaim.
Link: https://lkml.kernel.org/r/20231130194023.4102148-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Reviewed-by: Yosry Ahmed Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7bdcf3020d7a..2bd7d14ace78 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -821,6 +821,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) return !memcg || css_tryget(&memcg->css); } +static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget_online(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -1349,6 +1354,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) return true; } +static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } -- cgit v1.2.3 From a65b0e7607ccb5e5184591f73e48512f25c76061 Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:20 -0800 Subject: zswap: make shrinking memcg-aware Currently, we only have a single global LRU for zswap. This makes it impossible to perform workload-specific shrinking - an memcg cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs.
This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u This patch fully resolves the issue by replacing the global zswap LRU with memcg- and NUMA-specific LRUs, and modify the reclaim logic: a) When a store attempt hits an memcg limit, it now triggers a synchronous reclaim attempt that, if successful, allows the new hotter page to be accepted by zswap. b) If the store attempt instead hits the global zswap limit, it will trigger an asynchronous reclaim attempt, in which an memcg is selected for reclaim in a round-robin-like fashion. [nphamcs@gmail.com: use correct function for the onlineness check, use mem_cgroup_iter_break()] Link: https://lkml.kernel.org/r/20231205195419.2563217-1-nphamcs@gmail.com [nphamcs@gmail.com: drop the pool's reference at the end of the writeback step] Link: https://lkml.kernel.org/r/20231206030627.4155634-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-4-nphamcs@gmail.com Signed-off-by: Domenico Cerasuolo Co-developed-by: Nhat Pham Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 + include/linux/zswap.h | 2 + mm/memcontrol.c | 2 + mm/swap.h | 3 +- mm/swap_state.c | 24 +++- mm/zswap.c | 268 ++++++++++++++++++++++++++++++++++++--------- 6 files changed, 245 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2bd7d14ace78..a308c8eacf20 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1192,6 +1192,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline struct mem_cgroup 
*get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) +{ + return NULL; +} + static inline bool folio_memcg_kmem(struct folio *folio) { return false; diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a60ce39cfde..e571e393669b 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); #else @@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio) static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} +static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 592572d4842e..ce75e504fe8b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5614,6 +5614,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); + zswap_memcg_offline_cleanup(memcg); + memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); diff --git a/mm/swap.h b/mm/swap.h index 73c332ee4d91..c0dc73e10e91 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated); + bool *new_page_allocated, + bool skip_if_exists); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, diff --git a/mm/swap_state.c b/mm/swap_state.c index 85d9e5806a6a..6c84236382f3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -412,7 +412,8 
@@ struct folio *filemap_get_incore_folio(struct address_space *mapping, struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated) + bool *new_page_allocated, + bool skip_if_exists) { struct swap_info_struct *si; struct folio *folio; @@ -470,6 +471,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (err != -EEXIST) goto fail_put_swap; + /* + * Protect against a recursive call to __read_swap_cache_async() + * on the same entry waiting forever here because SWAP_HAS_CACHE + * is set but the folio is not the swap cache yet. This can + * happen today if mem_cgroup_swapin_charge_folio() below + * triggers reclaim through zswap, which may call + * __read_swap_cache_async() in the writeback path. + */ + if (skip_if_exists) + goto fail_put_swap; + /* * We might race against __delete_from_swap_cache(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE @@ -537,7 +549,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, mpol = get_vma_policy(vma, addr, 0, &ilx); page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) @@ -654,7 +666,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, /* Ok, do the async read-ahead now */ page = __read_swap_cache_async( swp_entry(swp_type(entry), offset), - gfp_mask, mpol, ilx, &page_allocated); + gfp_mask, mpol, ilx, &page_allocated, false); if (!page) continue; if (page_allocated) { @@ -672,7 +684,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); return page; @@ -827,7 +839,7 @@ static struct page *swap_vma_readahead(swp_entry_t 
targ_entry, gfp_t gfp_mask, pte_unmap(pte); pte = NULL; page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, - &page_allocated); + &page_allocated, false); if (!page) continue; if (page_allocated) { @@ -847,7 +859,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, - &page_allocated); + &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); return page; diff --git a/mm/zswap.c b/mm/zswap.c index 699c6ee11222..213626e0f659 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "swap.h" #include "internal.h" @@ -174,8 +175,8 @@ struct zswap_pool { struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; - struct list_head lru; - spinlock_t lru_lock; + struct list_lru list_lru; + struct mem_cgroup *next_shrink; }; /* @@ -291,15 +292,46 @@ static void zswap_update_total_size(void) zswap_pool_total_size = total; } +/* should be called under RCU */ +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return entry->objcg ? 
obj_cgroup_memcg(entry->objcg) : NULL; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry) +{ + return NULL; +} +#endif + +static inline int entry_to_nid(struct zswap_entry *entry) +{ + return page_to_nid(virt_to_page(entry)); +} + +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + + /* lock out zswap pools list modification */ + spin_lock(&zswap_pools_lock); + list_for_each_entry(pool, &zswap_pools, list) { + if (pool->next_shrink == memcg) + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + } + spin_unlock(&zswap_pools_lock); +} + /********************************* * zswap entry functions **********************************/ static struct kmem_cache *zswap_entry_cache; -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid) { struct zswap_entry *entry; - entry = kmem_cache_alloc(zswap_entry_cache, gfp); + entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid); if (!entry) return NULL; entry->refcount = 1; @@ -312,6 +344,61 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } +/********************************* +* lru functions +**********************************/ +static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + + /* + * Note that it is safe to use rcu_read_lock() here, even in the face of + * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection + * used in list_lru lookup, only two scenarios are possible: + * + * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The + * new entry will be reparented to memcg's parent's list_lru. + * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The + * new entry will be added directly to memcg's parent's list_lru. 
+ * + * Similar reasoning holds for list_lru_del() and list_lru_putback(). + */ + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_add(list_lru, &entry->lru, nid, memcg); + rcu_read_unlock(); +} + +static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + /* will always succeed */ + list_lru_del(list_lru, &entry->lru, nid, memcg); + rcu_read_unlock(); +} + +static void zswap_lru_putback(struct list_lru *list_lru, + struct zswap_entry *entry) +{ + int nid = entry_to_nid(entry); + spinlock_t *lock = &list_lru->node[nid].lock; + struct mem_cgroup *memcg; + + rcu_read_lock(); + memcg = mem_cgroup_from_entry(entry); + spin_lock(lock); + /* we cannot use list_lru_add here, because it increments node's lru count */ + list_lru_putback(list_lru, &entry->lru, nid, memcg); + spin_unlock(lock); + rcu_read_unlock(); +} + /********************************* * rbtree functions **********************************/ @@ -396,9 +483,7 @@ static void zswap_free_entry(struct zswap_entry *entry) if (!entry->length) atomic_dec(&zswap_same_filled_pages); else { - spin_lock(&entry->pool->lru_lock); - list_del(&entry->lru); - spin_unlock(&entry->pool->lru_lock); + zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); zswap_pool_put(entry->pool); } @@ -632,21 +717,15 @@ static void zswap_invalidate_entry(struct zswap_tree *tree, zswap_entry_put(tree, entry); } -static int zswap_reclaim_entry(struct zswap_pool *pool) +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg) { - struct zswap_entry *entry; + struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); struct zswap_tree *tree; pgoff_t swpoffset; - int ret; + enum lru_status ret = LRU_REMOVED_RETRY; + int writeback_result; - 
/* Get an entry off the LRU */ - spin_lock(&pool->lru_lock); - if (list_empty(&pool->lru)) { - spin_unlock(&pool->lru_lock); - return -EINVAL; - } - entry = list_last_entry(&pool->lru, struct zswap_entry, lru); - list_del_init(&entry->lru); /* * Once the lru lock is dropped, the entry might get freed. The * swpoffset is copied to the stack, and entry isn't deref'd again @@ -654,28 +733,32 @@ static int zswap_reclaim_entry(struct zswap_pool *pool) */ swpoffset = swp_offset(entry->swpentry); tree = zswap_trees[swp_type(entry->swpentry)]; - spin_unlock(&pool->lru_lock); + list_lru_isolate(l, item); + /* + * It's safe to drop the lock here because we return either + * LRU_REMOVED_RETRY or LRU_RETRY. + */ + spin_unlock(lock); /* Check for invalidate() race */ spin_lock(&tree->lock); - if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) { - ret = -EAGAIN; + if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) goto unlock; - } + /* Hold a reference to prevent a free during writeback */ zswap_entry_get(entry); spin_unlock(&tree->lock); - ret = zswap_writeback_entry(entry, tree); + writeback_result = zswap_writeback_entry(entry, tree); spin_lock(&tree->lock); - if (ret) { - /* Writeback failed, put entry back on LRU */ - spin_lock(&pool->lru_lock); - list_move(&entry->lru, &pool->lru); - spin_unlock(&pool->lru_lock); + if (writeback_result) { + zswap_reject_reclaim_fail++; + zswap_lru_putback(&entry->pool->list_lru, entry); + ret = LRU_RETRY; goto put_unlock; } + zswap_written_back_pages++; /* * Writeback started successfully, the page now belongs to the @@ -689,24 +772,91 @@ put_unlock: zswap_entry_put(tree, entry); unlock: spin_unlock(&tree->lock); - return ret ? -EAGAIN : 0; + spin_lock(lock); + return ret; +} + +static int shrink_memcg(struct mem_cgroup *memcg) +{ + struct zswap_pool *pool; + int nid, shrunk = 0; + + /* + * Skip zombies because their LRUs are reparented and we would be + * reclaiming from the parent instead of the dead memcg. 
+ */ + if (memcg && !mem_cgroup_online(memcg)) + return -ENOENT; + + pool = zswap_pool_current_get(); + if (!pool) + return -EINVAL; + + for_each_node_state(nid, N_NORMAL_MEMORY) { + unsigned long nr_to_walk = 1; + + shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg, + &shrink_memcg_cb, NULL, &nr_to_walk); + } + zswap_pool_put(pool); + return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct zswap_pool *pool = container_of(w, typeof(*pool), shrink_work); + struct mem_cgroup *memcg; int ret, failures = 0; + /* global reclaim will select cgroup in a round-robin fashion. */ do { - ret = zswap_reclaim_entry(pool); - if (ret) { - zswap_reject_reclaim_fail++; - if (ret != -EAGAIN) + spin_lock(&zswap_pools_lock); + pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL); + memcg = pool->next_shrink; + + /* + * We need to retry if we have gone through a full round trip, or if we + * got an offline memcg (or else we risk undoing the effect of the + * zswap memcg offlining cleanup callback). This is not catastrophic + * per se, but it will keep the now offlined memcg hostage for a while. + * + * Note that if we got an online memcg, we will keep the extra + * reference in case the original reference obtained by mem_cgroup_iter + * is dropped by the zswap memcg offlining callback, ensuring that the + * memcg is not killed when we are reclaiming. 
+ */ + if (!memcg) { + spin_unlock(&zswap_pools_lock); + if (++failures == MAX_RECLAIM_RETRIES) break; + + goto resched; + } + + if (!mem_cgroup_tryget_online(memcg)) { + /* drop the reference from mem_cgroup_iter() */ + mem_cgroup_iter_break(NULL, memcg); + pool->next_shrink = NULL; + spin_unlock(&zswap_pools_lock); + if (++failures == MAX_RECLAIM_RETRIES) break; + + goto resched; } + spin_unlock(&zswap_pools_lock); + + ret = shrink_memcg(memcg); + /* drop the extra reference */ + mem_cgroup_put(memcg); + + if (ret == -EINVAL) + break; + if (ret && ++failures == MAX_RECLAIM_RETRIES) + break; + +resched: cond_resched(); } while (!zswap_can_accept()); zswap_pool_put(pool); @@ -767,8 +917,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - INIT_LIST_HEAD(&pool->lru); - spin_lock_init(&pool->lru_lock); + list_lru_init_memcg(&pool->list_lru, NULL); INIT_WORK(&pool->shrink_work, shrink_worker); zswap_pool_debug("created", pool); @@ -834,6 +983,13 @@ static void zswap_pool_destroy(struct zswap_pool *pool) cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); + list_lru_destroy(&pool->list_lru); + + spin_lock(&zswap_pools_lock); + mem_cgroup_iter_break(NULL, pool->next_shrink); + pool->next_shrink = NULL; + spin_unlock(&zswap_pools_lock); + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) zpool_destroy_pool(pool->zpools[i]); kfree(pool); @@ -1081,7 +1237,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* try to allocate swap cache page */ mpol = get_task_policy(current); page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &page_was_allocated); + NO_INTERLEAVE_INDEX, &page_was_allocated, true); if (!page) { ret = -ENOMEM; goto fail; @@ -1147,7 +1303,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /* start writeback */ __swap_writepage(page, &wbc); put_page(page); - zswap_written_back_pages++; 
return ret; @@ -1204,6 +1359,7 @@ bool zswap_store(struct folio *folio) struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; + struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; struct zpool *zpool; unsigned int dlen = PAGE_SIZE; @@ -1235,15 +1391,15 @@ bool zswap_store(struct folio *folio) zswap_invalidate_entry(tree, dupentry); } spin_unlock(&tree->lock); - - /* - * XXX: zswap reclaim does not work with cgroups yet. Without a - * cgroup-aware entry LRU, we will push out entries system-wide based on - * local cgroup limits. - */ objcg = get_obj_cgroup_from_folio(folio); - if (objcg && !obj_cgroup_may_zswap(objcg)) - goto reject; + if (objcg && !obj_cgroup_may_zswap(objcg)) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (shrink_memcg(memcg)) { + mem_cgroup_put(memcg); + goto reject; + } + mem_cgroup_put(memcg); + } /* reclaim space if needed */ if (zswap_is_full()) { @@ -1260,7 +1416,7 @@ bool zswap_store(struct folio *folio) } /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL); + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; @@ -1287,6 +1443,15 @@ bool zswap_store(struct folio *folio) if (!entry->pool) goto freepage; + if (objcg) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) { + mem_cgroup_put(memcg); + goto put_pool; + } + mem_cgroup_put(memcg); + } + /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1365,9 +1530,8 @@ insert_entry: zswap_invalidate_entry(tree, dupentry); } if (entry->length) { - spin_lock(&entry->pool->lru_lock); - list_add(&entry->lru, &entry->pool->lru); - spin_unlock(&entry->pool->lru_lock); + INIT_LIST_HEAD(&entry->lru); + zswap_lru_add(&entry->pool->list_lru, entry); } spin_unlock(&tree->lock); @@ -1380,6 +1544,7 @@ insert_entry: put_dstmem: mutex_unlock(acomp_ctx->mutex); +put_pool: 
zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); @@ -1474,9 +1639,8 @@ freeentry: zswap_invalidate_entry(tree, entry); folio_mark_dirty(folio); } else if (entry->length) { - spin_lock(&entry->pool->lru_lock); - list_move(&entry->lru, &entry->pool->lru); - spin_unlock(&entry->pool->lru_lock); + zswap_lru_del(&entry->pool->list_lru, entry); + zswap_lru_add(&entry->pool->list_lru, entry); } zswap_entry_put(tree, entry); spin_unlock(&tree->lock); -- cgit v1.2.3 From 7108cc3f765cafd48a6a35f8add140beaecfa75b Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:21 -0800 Subject: mm: memcg: add per-memcg zswap writeback stat Since zswap now writes back pages from memcg-specific LRUs, we now need a new stat to show writebacks count for each memcg. [nphamcs@gmail.com: rename ZSWP_WB to ZSWPWB] Link: https://lkml.kernel.org/r/20231205193307.2432803-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-5-nphamcs@gmail.com Suggested-by: Nhat Pham Signed-off-by: Domenico Cerasuolo Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Reviewed-by: Yosry Ahmed Cc: Chris Li Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 1 + mm/memcontrol.c | 1 + mm/vmstat.c | 1 + mm/zswap.c | 4 ++++ 4 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index d1b847502f09..747943bc8cc2 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -142,6 +142,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_ZSWAP ZSWPIN, ZSWPOUT, + ZSWPWB, #endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce75e504fe8b..69b0ad455242 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -703,6 +703,7 @@ static 
const unsigned int memcg_vm_event_stat[] = { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) ZSWPIN, ZSWPOUT, + ZSWPWB, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE THP_FAULT_ALLOC, diff --git a/mm/vmstat.c b/mm/vmstat.c index afa5a38fcc9c..cfd8d8256f8e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1401,6 +1401,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_ZSWAP "zswpin", "zswpout", + "zswpwb", #endif #ifdef CONFIG_X86 "direct_map_level2_splits", diff --git a/mm/zswap.c b/mm/zswap.c index 213626e0f659..c329fca810c8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -760,6 +760,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o } zswap_written_back_pages++; + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPWB); + + count_vm_event(ZSWPWB); /* * Writeback started successfully, the page now belongs to the * swapcache. Drop the entry from zswap - unless invalidate already -- cgit v1.2.3 From b5ba474f3f518701249598b35c581b92a3c95b48 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:23 -0800 Subject: zswap: shrink zswap pool based on memory pressure Currently, we only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages). This patch implements a memcg- and NUMA-aware shrinker for zswap, that is initiated when there is memory pressure. The shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. 
Furthermore, to make it more robust for many workloads and prevent overshrinking (i.e., evicting warm pages that might be refaulted into memory), we build in the following heuristics: * Estimate the number of warm pages residing in zswap, and attempt to protect this region of the zswap LRU. * Scale the number of freeable objects by an estimate of the memory saving factor. The better zswap compresses the data, the fewer pages we will evict to swap (as we will otherwise incur IO for relatively small memory saving). * During reclaim, if the shrinker encounters a page that is also being brought into memory, the shrinker will cautiously terminate its shrinking action, as this is a sign that it is touching the warmer region of the zswap LRU. As a proof of concept, we ran the following synthetic benchmark: build the Linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improve the overall performance. Depending on the amount of cold data generated, we observed a 14% to 35% reduction in kernel CPU time used in the kernel builds.
[nphamcs@gmail.com: check shrinker enablement early, use less costly stat flushing] Link: https://lkml.kernel.org/r/20231206194456.3234203-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-7-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Cc: Chengming Zhou Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/zswap.rst | 10 ++ include/linux/mmzone.h | 2 + include/linux/zswap.h | 25 ++++- mm/Kconfig | 14 +++ mm/mmzone.c | 1 + mm/swap_state.c | 2 + mm/zswap.c | 192 ++++++++++++++++++++++++++++++++- 7 files changed, 240 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 45b98390e938..62fc244ec702 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -153,6 +153,16 @@ attribute, e. g.:: Setting this parameter to 100 will disable the hysteresis. +When there is a sizable amount of cold memory residing in the zswap pool, it +can be advantageous to proactively write these cold pages to swap and reclaim +the memory for other use cases. By default, the zswap shrinker is disabled. +User can enable it as follows: + + echo Y > /sys/module/zswap/parameters/shrinker_enabled + +This can be enabled at the boot time if ``CONFIG_ZSWAP_SHRINKER_DEFAULT_ON`` is +selected. + A debugfs interface is provided for various statistic about pool size, number of pages stored, same-value filled pages and various counters for the reasons pages are rejected. 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 14faffa4354f..9ef9d010bff0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -22,6 +22,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -641,6 +642,7 @@ struct lruvec { #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif + struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e571e393669b..08c240e16a01 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -5,20 +5,40 @@ #include #include +struct lruvec; + extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP +struct zswap_lruvec_state { + /* + * Number of pages in zswap that should be protected from the shrinker. + * This number is an estimate of the following counts: + * + * a) Recent page faults. + * b) Recent insertion to the zswap LRU. This includes new zswap stores, + * as well as recent zswap LRU rotations. + * + * These pages are likely to be warm, and might incur IO if they are written + * to swap.
+ */ + atomic_long_t nr_zswap_protected; +}; + bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); - +void zswap_lruvec_state_init(struct lruvec *lruvec); +void zswap_page_swapin(struct page *page); #else +struct zswap_lruvec_state {}; + static inline bool zswap_store(struct folio *folio) { return false; @@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} - +static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} +static inline void zswap_page_swapin(struct page *page) {} #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 57cd378c73d6..ca87cdb72f11 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -61,6 +61,20 @@ config ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON The cost is that if the page was never dirtied and needs to be swapped out again, it will be re-compressed. +config ZSWAP_SHRINKER_DEFAULT_ON + bool "Shrink the zswap pool on memory pressure" + depends on ZSWAP + default n + help + If selected, the zswap shrinker will be enabled, and the pages + stored in the zswap pool will become available for reclaim (i.e + written back to the backing swap device) on memory pressure. + + This means that zswap writeback could happen even if the pool is + not yet full, or the cgroup zswap limit has not been reached, + reducing the chance that cold pages will reside in the zswap pool + and consume memory indefinitely. 
+ choice prompt "Default compressor" depends on ZSWAP diff --git a/mm/mmzone.c b/mm/mmzone.c index b594d3f268fe..c01896eca736 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -78,6 +78,7 @@ void lruvec_init(struct lruvec *lruvec) memset(lruvec, 0, sizeof(struct lruvec)); spin_lock_init(&lruvec->lru_lock); + zswap_lruvec_state_init(lruvec); for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); diff --git a/mm/swap_state.c b/mm/swap_state.c index 6c84236382f3..c597cec606e4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -687,6 +687,7 @@ skip: &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); + zswap_page_swapin(page); return page; } @@ -862,6 +863,7 @@ skip: &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); + zswap_page_swapin(page); return page; } diff --git a/mm/zswap.c b/mm/zswap.c index c329fca810c8..015425ed9003 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -148,6 +148,11 @@ module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 +/* Enable/disable memory pressure-based shrinker. 
*/ +static bool zswap_shrinker_enabled = IS_ENABLED( + CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); +module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); + /********************************* * data structures **********************************/ @@ -177,6 +182,8 @@ struct zswap_pool { char tfm_name[CRYPTO_MAX_ALG_NAME]; struct list_lru list_lru; struct mem_cgroup *next_shrink; + struct shrinker *shrinker; + atomic_t nr_stored; }; /* @@ -275,17 +282,26 @@ static bool zswap_can_accept(void) DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } +static u64 get_zswap_pool_size(struct zswap_pool *pool) +{ + u64 pool_size = 0; + int i; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + pool_size += zpool_get_total_size(pool->zpools[i]); + + return pool_size; +} + static void zswap_update_total_size(void) { struct zswap_pool *pool; u64 total = 0; - int i; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - total += zpool_get_total_size(pool->zpools[i]); + total += get_zswap_pool_size(pool); rcu_read_unlock(); @@ -344,13 +360,34 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } +/********************************* +* zswap lruvec functions +**********************************/ +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_page_swapin(struct page *page) +{ + struct lruvec *lruvec; + + if (page) { + lruvec = folio_lruvec(page_folio(page)); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + /********************************* * lru functions **********************************/ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { + atomic_long_t *nr_zswap_protected; + unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; + struct lruvec *lruvec; /* * Note that it is safe to use 
rcu_read_lock() here, even in the face of @@ -368,6 +405,19 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); + + /* Update the protection area */ + lru_size = list_lru_count_one(list_lru, nid, memcg); + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; + old = atomic_long_inc_return(nr_zswap_protected); + /* + * Decay to avoid overflow and adapt to changing workloads. + * This is based on LRU reclaim cost decaying heuristics. + */ + do { + new = old > lru_size / 4 ? old / 2 : old; + } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } @@ -389,6 +439,7 @@ static void zswap_lru_putback(struct list_lru *list_lru, int nid = entry_to_nid(entry); spinlock_t *lock = &list_lru->node[nid].lock; struct mem_cgroup *memcg; + struct lruvec *lruvec; rcu_read_lock(); memcg = mem_cgroup_from_entry(entry); @@ -396,6 +447,10 @@ static void zswap_lru_putback(struct list_lru *list_lru, /* we cannot use list_lru_add here, because it increments node's lru count */ list_lru_putback(list_lru, &entry->lru, nid, memcg); spin_unlock(lock); + + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); + /* increment the protection area to account for the LRU rotation. 
*/ + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); rcu_read_unlock(); } @@ -485,6 +540,7 @@ static void zswap_free_entry(struct zswap_entry *entry) else { zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); + atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } zswap_entry_cache_free(entry); @@ -526,6 +582,109 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, return entry; } +/********************************* +* shrinker functions +**********************************/ +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg); + +static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + unsigned long shrink_ret, nr_protected, lru_size; + struct zswap_pool *pool = shrinker->private_data; + bool encountered_page_in_swapcache = false; + + if (!zswap_shrinker_enabled) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + lru_size = list_lru_shrink_count(&pool->list_lru, sc); + + /* + * Abort if we are shrinking into the protected region. + * + * This short-circuiting is necessary because if we have too many multiple + * concurrent reclaimers getting the freeable zswap object counts at the + * same time (before any of them made reasonable progress), the total + * number of reclaimed objects might be more than the number of unprotected + * objects (i.e the reclaimers will reclaim into the protected area of the + * zswap LRU). 
+ */ + if (nr_protected >= lru_size - sc->nr_to_scan) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, + &encountered_page_in_swapcache); + + if (encountered_page_in_swapcache) + return SHRINK_STOP; + + return shrink_ret ? shrink_ret : SHRINK_STOP; +} + +static unsigned long zswap_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct zswap_pool *pool = shrinker->private_data; + struct mem_cgroup *memcg = sc->memcg; + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); + unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + + if (!zswap_shrinker_enabled) + return 0; + +#ifdef CONFIG_MEMCG_KMEM + mem_cgroup_flush_stats(); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); +#else + /* use pool stats instead of memcg stats */ + nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; + nr_stored = atomic_read(&pool->nr_stored); +#endif + + if (!nr_stored) + return 0; + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); + /* + * Subtract the lru size by an estimate of the number of pages + * that should be protected. + */ + nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + + /* + * Scale the number of freeable pages by the memory saving factor. + * This ensures that the better zswap compresses memory, the fewer + * pages we will evict to swap (as it will otherwise incur IO for + * relatively small memory saving). 
+ */ + return mult_frac(nr_freeable, nr_backing, nr_stored); +} + +static void zswap_alloc_shrinker(struct zswap_pool *pool) +{ + pool->shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); + if (!pool->shrinker) + return; + + pool->shrinker->private_data = pool; + pool->shrinker->scan_objects = zswap_shrinker_scan; + pool->shrinker->count_objects = zswap_shrinker_count; + pool->shrinker->batch = 0; + pool->shrinker->seeks = DEFAULT_SEEKS; +} + /********************************* * per-cpu code **********************************/ @@ -721,6 +880,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o spinlock_t *lock, void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; struct zswap_tree *tree; pgoff_t swpoffset; enum lru_status ret = LRU_REMOVED_RETRY; @@ -756,6 +916,17 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o zswap_reject_reclaim_fail++; zswap_lru_putback(&entry->pool->list_lru, entry); ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). 
+ */ + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_SKIP; + *encountered_page_in_swapcache = true; + } + goto put_unlock; } zswap_written_back_pages++; @@ -914,6 +1085,11 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) &pool->node); if (ret) goto error; + + zswap_alloc_shrinker(pool); + if (!pool->shrinker) + goto error; + pr_debug("using %s compressor\n", pool->tfm_name); /* being the current pool takes 1 ref; this func expects the @@ -921,13 +1097,19 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - list_lru_init_memcg(&pool->list_lru, NULL); + if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) + goto lru_fail; + shrinker_register(pool->shrinker); INIT_WORK(&pool->shrink_work, shrink_worker); + atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; +lru_fail: + list_lru_destroy(&pool->list_lru); + shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -985,6 +1167,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); + shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); list_lru_destroy(&pool->list_lru); @@ -1536,6 +1719,7 @@ insert_entry: if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&entry->pool->list_lru, entry); + atomic_inc(&entry->pool->nr_stored); } spin_unlock(&tree->lock); -- cgit v1.2.3 From 9294a037c01564786abb15436529fae3863268a2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:44 +0000 Subject: mm/damon/core: implement goal-oriented feedback-driven quota auto-tuning Patch series "mm/damon: let users feed and tame/auto-tune DAMOS". Introduce Aim-oriented Feedback-driven DAMOS Aggressiveness Auto-tuning. It makes DAMOS self-tuned with periodic simple user feedback. 
Background: DAMOS Control Difficulty ==================================== DAMOS helps users easily implement access pattern aware system operations. However, controlling DAMOS in the wild is not that easy. The basic way for DAMOS control is specifying the target access pattern. In this approach, the user is assumed to have a good understanding of the access pattern and the characteristics of the system and the workloads. Though there are useful tools for that, it takes time and effort depending on the complexity and the dynamicity of the system and the workloads. After all, the access pattern consists of three ranges, namely the size, the access rate, and the age of the regions. Each range has a minimum and a maximum, so users need to tune six parameters, which is not a simple task. One of the worst cases would be DAMOS being too aggressive like a berserker, and therefore consuming too many system resources and making unwanted radical system operations. To let users avoid such cases, DAMOS allows users to set the upper limit of the schemes' aggressiveness, namely the DAMOS quota. DAMOS further provides its best effort under the limit by prioritizing regions based on the access pattern of the regions. For example, users can ask DAMOS to page out up to 100 MiB of memory regions per second. Then DAMOS pages out regions that are not accessed for a longer time (colder) first under the limit. This allows users to set the target access pattern a bit naively with wider ranges, and focus on tuning only one parameter, the quota. In other words, the number of parameters to tune can be reduced from six to one. Still, however, the optimum value for the quota depends on the system and the workloads' characteristics, so finding it is not that simple. The number of parameters to tune can also increase again if the user needs to run multiple schemes. Aim-oriented Feedback-driven DAMOS Aggressiveness Auto Tuning ============================================================= Users would use DAMOS since they want to achieve something with it.
They will likely have measurable metrics representing the achievement and the target number of the metric like SLO, and continuously measure that anyway. While the additional cost of getting the information is nearly zero, it could be useful for DAMOS to understand how appropriately its current aggressiveness is set, and adjust it on its own to make the metric value closer to the target. Based on this idea, we introduce a new way of tuning DAMOS with nearly zero additional effort, namely Aim-oriented Feedback-driven DAMOS Aggressiveness Auto Tuning. It asks users to provide feedback representing how well DAMOS is doing relative to the users' aim. Then DAMOS adjusts its aggressiveness, specifically the quota that provides the best effort result under the limit, based on the current level of the aggressiveness and the users' feedback. Implementation ============== The implementation asks users to represent the feedback with score numbers. The scores could be anything, including user-space specific metrics such as latency and throughput of special user-space workloads, and system metrics such as free memory ratio, memory pressure stall time (PSI), and active to inactive LRU lists size ratio. The feedback scores and the aggressiveness of the given DAMOS scheme are assumed to be positively proportional, though. Selecting metrics that satisfy this assumption is the users' responsibility. The core logic uses the below simple feedback loop algorithm to calculate the next aggressiveness level of the scheme from the current aggressiveness level and the current feedback (target_score and current_score). It calculates the compensation for the next aggressiveness as a proportion of the current aggressiveness and the distance to the target score. As a result, it arrives at the near-goal state in a short time using big steps when it's far from the goal, but avoids making unnecessarily radical changes that could turn out to be a bad decision by using small steps when it's near the goal.
f(n) = max(1, f(n - 1) * ((target_score - current_score) / target_score + 1)) Note that the compensation value becomes negative when it's over achieving the goal. That's why the feedback metric and the aggressiveness of the scheme should be positively proportional. The distance-adaptive speed manipulation is simply applied. Example Use Cases ================= If users want to reduce the memory footprint of the system as much as possible as long as the time spent for handling the resulting memory pressure is within a threshold, they could use DAMOS scheme that reclaims cold memory regions aiming for a little level of memory pressure stall time. If users want the active/inactive LRU lists well balanced to reduce the performance impact due to possible future memory pressure, they could use two schemes. The first one would be set to locate hot pages in the active LRU list, aiming for a specific active-to-inactive LRU list size ratio, say, 70%. The second one would be to locate cold pages in the inactive LRU list, aiming for a specific inactive-to-active LRU list size ratio, say, 30%. Then, DAMOS will balance the two schemes based on the goal and feedback. This aim-oriented auto tuning could also be useful for general balancing-required access aware system operations such as system memory auto scaling[3] and tiered memory management[4]. These two example usages are not what current DAMOS implementation is already supporting, but require additional DAMOS action developments, though. Evaluation: subtle memory pressure aiming proactive reclamation =============================================================== To show if the implementation works as expected, we prepare four different system configurations on AWS i3.metal instances. The first setup (original) runs the workload without any DAMOS scheme. 
The second setup (not-tuned) runs the workload with a virtual address space-based proactive reclamation scheme that pages out memory regions that are not accessed for five seconds or more. The third setup (offline-tuned) runs the same proactive reclamation DAMOS scheme, but after making it tuned for each workload offline, using our previous user-space driven automatic tuning approach, namely DAMOOS[1]. The fourth and final setup (AFDAA) runs the scheme that is the same as that of 'not-tuned' setup, but aims to keep 0.5% of 'some' memory pressure stall time (PSI) for the last 10 seconds using the aim-oriented auto tuning. For each setup, we run realistic workloads from PARSEC3 and SPLASH-2X benchmark suites. For each run, we measure RSS and runtime of the workload, and 'some' memory pressure stall time (PSI) of the system. We repeat the runs five times and use averaged measurements. For simple comparison of the results, we normalize the measurements to those of 'original'. In the case of the PSI, though, the measurement for 'original' was zero, so we normalize the value to that of 'not-tuned' scheme's result. The normalized results are shown below. Not-tuned Offline-tuned AFDAA RSS 0.622688178226118 0.787950678944904 0.740093483278979 runtime 1.11767826657912 1.0564674983585 1.0910833880499 PSI 1 0.727521443794069 0.308498846350299 The 'not-tuned' scheme achieves about 37.7% memory saving but incurs about 11.7% runtime slowdown. The 'offline-tuned' scheme achieves about 21.2% memory saving with about 5.6% runtime slowdown. It also achieves about 27.2% memory pressure stall time saving. AFDAA achieves about 26% memory saving with about 9.1% runtime slowdown. It also achieves about 69.1% memory pressure stall time saving. We repeat this test multiple times, and get consistent results. AFDAA is now integrated in our daily DAMON performance test setup.
Apparently the aggressiveness of 'AFDAA' setup is somewhere between those of 'not-tuned' and 'offline-tuned' setup, since its memory saving and runtime overhead are between those of the other two setups. Actually we set the memory pressure stall time goal aiming for this middle aggressiveness. The differences in the two metrics are not significant, though. However, it shows significant saving of the memory pressure stall time, which was the goal of the auto-tuning, over the two variants. Hence, we conclude the automatic tuning is working as expected. Please note that the AFDAA setup is only for the evaluation, and therefore intentionally set a bit aggressive. It might not be appropriate for production environments. The test code is also available[2], so you could reproduce it on your system and workloads. Patches Sequence ================ The first four patches implement the core logic and user interfaces for the auto tuning. The first patch implements the core logic for the auto tuning, and the API for DAMOS users in the kernel space. The second patch implements basic file operations of DAMON sysfs directories and files that will be used for setting the goals and providing the feedback. The third patch connects the quota goals files inputs to the DAMOS core logic. Finally the fourth patch implements a dedicated DAMOS sysfs command for efficiently committing the quota goals feedback. Two patches for simple tests of the logic and interfaces follow. The fifth patch implements the core logic unit test. The sixth patch implements a selftest for the DAMON Sysfs interface for the goals. Finally, three patches for documentation follow. The seventh patch documents the design of the feature. The eighth patch updates the API doc for the new sysfs files. The final ninth patch updates the usage document for the features.
References ========== [1] DAOS paper: https://www.amazon.science/publications/daos-data-access-aware-operating-system [2] Evaluation code: https://github.com/damonitor/damon-tests/commit/3f884e61193f0166b8724554b6d06b0c449a712d [3] Memory auto scaling RFC idea: https://lore.kernel.org/damon/20231112195114.61474-1-sj@kernel.org/ [4] DAMON-based tiered memory management RFC idea: https://lore.kernel.org/damon/20231112195602.61525-1-sj@kernel.org/ This patch (of 9) Users can effectively control the upper-limit aggressiveness of DAMOS schemes using the quota feature. The quota provides best result under the limit by prioritizing regions based on the access pattern. That said, finding the best value, which could depend on dynamic characteristics of the system and the workloads, is still challenging. Implement a simple feedback-driven tuning mechanism and use it for automatic tuning of DAMOS quota. The implementation allows users to provide the feedback by setting a feedback score returning callback function. Then DAMOS periodically calls the function back and adjusts the quota based on the return value of the callback and current quota value. Note that the absolute-value based time/size quotas still work as the maximum hard limits of the scheme's aggressiveness. The feedback-driven auto-tuned quota is applied only if it is not exceeding the manually set maximum limits. Same for the scheme-target access pattern and filters like other features. 
[sj@kernel.org: document get_score_arg field of struct damos_quota] Link: https://lkml.kernel.org/r/20231204170106.60992-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231130023652.50284-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231130023652.50284-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 20 +++++++++++++++ mm/damon/core.c | 68 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 79 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index ab2f17d9926b..aa34ab433bc5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -136,6 +136,9 @@ enum damos_action { * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. * @weight_age: Weight of the region's age for prioritization. * + * @get_score: Feedback function for self-tuning quota. + * @get_score_arg: Parameter for @get_score + * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -153,6 +156,17 @@ enum damos_action { * You could customize the prioritization logic by setting &weight_sz, * &weight_nr_accesses, and &weight_age, because monitoring operations are * encouraged to respect those. + * + * If @get_score function pointer is set, DAMON calls it back with + * @get_score_arg and get the return value of it for every @reset_interval. + * Then, DAMON adjusts the effective quota using the return value as a feedback + * score to the current quota, using its internal feedback loop algorithm. 
+ * + * The feedback loop algorithm assumes the quota input and the feedback score + * output are in a positive proportional relationship, and the goal of the + * tuning is getting the feedback score value of 10,000. If @ms and/or @sz are + * set together, those work as a hard limit quota. If neither @ms nor @sz are + * set, the mechanism starts from the quota of one byte. */ struct damos_quota { unsigned long ms; @@ -163,6 +177,9 @@ struct damos_quota { unsigned int weight_nr_accesses; unsigned int weight_age; + unsigned long (*get_score)(void *arg); + void *get_score_arg; + /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; @@ -179,6 +196,9 @@ struct damos_quota { /* For prioritization */ unsigned long histogram[DAMOS_MAX_SCORE + 1]; unsigned int min_score; + + /* For feedback loop */ + unsigned long esz_bp; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index ce1562783e7e..f91715a58dc7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1038,26 +1038,76 @@ static void damon_do_apply_schemes(struct damon_ctx *c, } } -/* Shouldn't be called if quota->ms and quota->sz are zero */ +/* + * damon_feed_loop_next_input() - get next input to achieve a target score. + * @last_input The last input. + * @score Current score that made with @last_input. + * + * Calculate next input to achieve the target score, based on the last input + * and current score. Assuming the input and the score are positively + * proportional, calculate how much compensation should be added to or + * subtracted from the last input as a proportion of the last input. Avoid + * next input always being zero by setting it non-zero always. In short form + * (assuming support of float and signed calculations), the algorithm is as + * below. + * + * next_input = max(last_input * ((goal - current) / goal + 1), 1) + * + * For simple implementation, we assume the target score is always 10,000. The + * caller should adjust @score for this.
+ * + * Returns next input that assumed to achieve the target score. + */ +static unsigned long damon_feed_loop_next_input(unsigned long last_input, + unsigned long score) +{ + const unsigned long goal = 10000; + unsigned long score_goal_diff = max(goal, score) - min(goal, score); + unsigned long score_goal_diff_bp = score_goal_diff * 10000 / goal; + unsigned long compensation = last_input * score_goal_diff_bp / 10000; + /* Set minimum input as 10000 to avoid compensation be zero */ + const unsigned long min_input = 10000; + + if (goal > score) + return last_input + compensation; + if (last_input > compensation + min_input) + return last_input - compensation; + return min_input; +} + +/* Shouldn't be called if quota->ms, quota->sz, and quota->get_score unset */ static void damos_set_effective_quota(struct damos_quota *quota) { unsigned long throughput; unsigned long esz; - if (!quota->ms) { + if (!quota->ms && !quota->get_score) { quota->esz = quota->sz; return; } - if (quota->total_charged_ns) - throughput = quota->total_charged_sz * 1000000 / - quota->total_charged_ns; - else - throughput = PAGE_SIZE * 1024; - esz = throughput * quota->ms; + if (quota->get_score) { + quota->esz_bp = damon_feed_loop_next_input( + max(quota->esz_bp, 10000UL), + quota->get_score(quota->get_score_arg)); + esz = quota->esz_bp / 10000; + } + + if (quota->ms) { + if (quota->total_charged_ns) + throughput = quota->total_charged_sz * 1000000 / + quota->total_charged_ns; + else + throughput = PAGE_SIZE * 1024; + if (quota->get_score) + esz = min(throughput * quota->ms, esz); + else + esz = throughput * quota->ms; + } if (quota->sz && quota->sz < esz) esz = quota->sz; + quota->esz = esz; } @@ -1069,7 +1119,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) unsigned long cumulated_sz; unsigned int score, max_score = 0; - if (!quota->ms && !quota->sz) + if (!quota->ms && !quota->sz && !quota->get_score) return; /* New charge window starts */ -- cgit v1.2.3 From 
1486fb50136f4799946f5ecfe050094574647153 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:28 +0800 Subject: mm: ksm: use more folio api in ksm_might_need_to_copy() Patch series "mm: cleanup and use more folio in page fault", v3. Rename page_copy_prealloc() to folio_prealloc(), which is used by more functions, also do more folio conversion in page fault. This patch (of 5): Since ksm only support normal page, no swapout/in for ksm large folio too, add large folio check in ksm_might_need_to_copy(), also convert page->index to folio->index as page->index is going away. Then convert ksm_might_need_to_copy() to use more folio api to save nine compound_head() calls, short 'address' to reduce max-line-length. Link: https://lkml.kernel.org/r/20231118023232.1409103-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20231118023232.1409103-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Sidhartha Kumar Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/ksm.h | 4 ++-- mm/ksm.c | 39 +++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c2dd786a30e1..4643d5244e77 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -77,7 +77,7 @@ static inline void ksm_exit(struct mm_struct *mm) * but what if the vma was unmerged while the page was swapped out? 
*/ struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address); + struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); @@ -130,7 +130,7 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, } static inline struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long addr) { return page; } diff --git a/mm/ksm.c b/mm/ksm.c index 5d60d5385de6..b93389a3780e 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2876,48 +2876,51 @@ void __ksm_exit(struct mm_struct *mm) } struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = page_folio(page); struct anon_vma *anon_vma = folio_anon_vma(folio); - struct page *new_page; + struct folio *new_folio; - if (PageKsm(page)) { - if (page_stable_node(page) && + if (folio_test_large(folio)) + return page; + + if (folio_test_ksm(folio)) { + if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) return page; /* no need to copy it */ } else if (!anon_vma) { return page; /* no need to copy it */ - } else if (page->index == linear_page_index(vma, address) && + } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { return page; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) return page; /* let do_swap_page report the error */ - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (new_page && - mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) { - put_page(new_page); - new_page = NULL; + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, 
false); + if (new_folio && + mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(new_folio); + new_folio = NULL; } - if (new_page) { - if (copy_mc_user_highpage(new_page, page, address, vma)) { - put_page(new_page); + if (new_folio) { + if (copy_mc_user_highpage(&new_folio->page, page, addr, vma)) { + folio_put(new_folio); memory_failure_queue(page_to_pfn(page), 0); return ERR_PTR(-EHWPOISON); } - SetPageDirty(new_page); - __SetPageUptodate(new_page); - __SetPageLocked(new_page); + folio_set_dirty(new_folio); + __folio_mark_uptodate(new_folio); + __folio_set_locked(new_folio); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } - return new_page; + return new_folio ? &new_folio->page : NULL; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) -- cgit v1.2.3 From f67f8d4a8c1e1ebc85a6cbdb9a7266f14863461c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 1 Dec 2023 14:59:36 -0500 Subject: mm/rmap: fix misplaced parenthesis of a likely() Running my yearly branch profiler to see where likely/unlikely annotation may be added or removed, I discovered this: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 457918 100 page_try_dup_anon_rmap rmap.h 264 [..] 458021 0 0 page_try_dup_anon_rmap rmap.h 265 I thought it was interesting that line 264 of rmap.h had a 100% incorrect annotation, but the line directly below it was 100% correct. Looking at the code: if (likely(!is_device_private_page(page) && unlikely(page_needs_cow_for_dma(vma, page)))) It didn't make sense. The "likely()" was around the entire if statement (not just the "!is_device_private_page(page)"), which also included the "unlikely()" portion of that if condition. If the unlikely portion is unlikely to be true, that would make the entire if condition unlikely to be true, so it made no sense at all to say the entire if condition is true. 
What is more likely to be likely is just the first part of the if statement before the && operation. It's likely to be a misplaced parenthesis. And after making the if condition broken into a likely() && unlikely(), both now appear to be correct! Link: https://lkml.kernel.org/r/20231201145936.5ddfdb50@gandalf.local.home Fixes:fb3d824d1a46c ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()") Signed-off-by: Steven Rostedt (Google) Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b26fe858fd44..3c2fc291b071 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -261,8 +261,8 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * guarantee the pinned page won't be randomly replaced in the * future on write faults. */ - if (likely(!is_device_private_page(page) && - unlikely(page_needs_cow_for_dma(vma, page)))) + if (likely(!is_device_private_page(page)) && + unlikely(page_needs_cow_for_dma(vma, page))) return -EBUSY; ClearPageAnonExclusive(page); -- cgit v1.2.3 From 7d7ef0a4686abe43cd76a141b340a348f45ecdf2 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 29 Nov 2023 03:21:53 +0000 Subject: mm: memcg: restore subtree stats flushing Stats flushing for memcg currently follows the following rules: - Always flush the entire memcg hierarchy (i.e. flush the root). - Only one flusher is allowed at a time. If someone else tries to flush concurrently, they skip and return immediately. - A periodic flusher flushes all the stats every 2 seconds. The reason this approach is followed is because all flushes are serialized by a global rstat spinlock. On the memcg side, flushing is invoked from userspace reads as well as in-kernel flushers (e.g. reclaim, refault, etc). 
This approach aims to avoid serializing all flushers on the global lock, which can cause a significant performance hit under high concurrency. This approach has the following problems: - Occasionally a userspace read of the stats of a non-root cgroup will be too expensive as it has to flush the entire hierarchy [1]. - Sometimes the stats accuracy is compromised if there is an ongoing flush, and we skip and return before the subtree of interest is actually flushed, yielding stale stats (by up to 2s due to periodic flushing). This is more visible when reading stats from userspace, but can also affect in-kernel flushers. The latter problem is particularly a concern when userspace reads stats after an event occurs, but gets stats from before the event. Examples: - When memory usage / pressure spikes, a userspace OOM handler may look at the stats of different memcgs to select a victim based on various heuristics (e.g. how much private memory will be freed by killing this). Reading stale stats from before the usage spike in this case may cause a wrongful OOM kill. - A proactive reclaimer may read the stats after writing to memory.reclaim to measure the success of the reclaim operation. Stale stats from before reclaim may give a false negative. - Reading the stats of a parent and a child memcg may be inconsistent (child larger than parent), if the flush doesn't happen when the parent is read, but happens when the child is read. As for in-kernel flushers, they will occasionally get stale stats. No regressions are currently known from this, but if there are regressions, they would be very difficult to debug and link to the source of the problem. This patch aims to fix these problems by restoring subtree flushing, and removing the unified/coalesced flushing logic that skips flushing if there is an ongoing flush. This change would introduce a significant regression with global stats flushing thresholds.
With per-memcg stats flushing thresholds, this seems to perform really well. The thresholds protect the underlying lock from unnecessary contention. This patch was tested in two ways to ensure the latency of flushing is up to par, on a machine with 384 cpus: - A synthetic test with 5000 concurrent workers in 500 cgroups doing allocations and reclaim, as well as 1000 readers for memory.stat (variation of [2]). No regressions were noticed in the total runtime. Note that significant regressions in this test are observed with global stats thresholds, but not with per-memcg thresholds. - A synthetic stress test for concurrently reading memcg stats while memory allocation/freeing workers are running in the background, provided by Wei Xu [3]. With 250k threads reading the stats every 100ms in 50k cgroups, 99.9% of reads take <= 50us. Less than 0.01% of reads take more than 1ms, and no reads take more than 100ms. [1] https://lore.kernel.org/lkml/CABWYdi0c6__rh-K7dcM_pkf9BJdTRtAU08M43KO9ME4-dsgfoQ@mail.gmail.com/ [2] https://lore.kernel.org/lkml/CAJD7tka13M-zVZTyQJYL1iUAYvuQ1fcHbCjcOBZcz6POYTV-4g@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CAAPL-u9D2b=iF5Lf_cRnKxUfkiEe0AMDTu6yhrUAzX0b6a6rDg@mail.gmail.com/ [akpm@linux-foundation.org: fix mm/zswap.c] [yosryahmed@google.com: remove stats flushing mutex] Link: https://lkml.kernel.org/r/CAJD7tkZgP3m-VVPn+fF_YuvXeQYK=tZZjJHj=dzD=CcSSpp2qg@mail.gmail.com Link: https://lkml.kernel.org/r/20231129032154.3710765-6-yosryahmed@google.com Signed-off-by: Yosry Ahmed Tested-by: Domenico Cerasuolo Acked-by: Shakeel Butt Cc: Chris Li Cc: Greg Thelen Cc: Ivan Babrou Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutny Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Waiman Long Cc: Wei Xu Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 +++--- mm/memcontrol.c | 68 ++++++++++++++++++++++++++-------------------- mm/vmscan.c | 2 +- mm/workingset.c | 10 +++++-- mm/zswap.c | 2 +- 5 files changed, 52 insertions(+), 38 
deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a308c8eacf20..43b77363ab8e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1051,8 +1051,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } -void mem_cgroup_flush_stats(void); -void mem_cgroup_flush_stats_ratelimited(void); +void mem_cgroup_flush_stats(struct mem_cgroup *memcg); +void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1563,11 +1563,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } -static inline void mem_cgroup_flush_stats(void) +static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { } -static inline void mem_cgroup_flush_stats_ratelimited(void) +static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c5aa0c2cb68b..b08b9cd4a3a8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -670,7 +670,6 @@ struct memcg_vmstats { */ static void flush_memcg_stats_dwork(struct work_struct *w); static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static atomic_t stats_flush_ongoing = ATOMIC_INIT(0); static u64 flush_last_time; #define FLUSH_TIME (2UL*HZ) @@ -731,35 +730,40 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) } } -static void do_flush_stats(void) +static void do_flush_stats(struct mem_cgroup *memcg) { - /* - * We always flush the entire tree, so concurrent flushers can just - * skip. This avoids a thundering herd problem on the rstat global lock - * from memcg flushers (e.g. reclaim, refault, etc). 
- */ - if (atomic_read(&stats_flush_ongoing) || - atomic_xchg(&stats_flush_ongoing, 1)) - return; - - WRITE_ONCE(flush_last_time, jiffies_64); - - cgroup_rstat_flush(root_mem_cgroup->css.cgroup); + if (mem_cgroup_is_root(memcg)) + WRITE_ONCE(flush_last_time, jiffies_64); - atomic_set(&stats_flush_ongoing, 0); + cgroup_rstat_flush(memcg->css.cgroup); } -void mem_cgroup_flush_stats(void) +/* + * mem_cgroup_flush_stats - flush the stats of a memory cgroup subtree + * @memcg: root of the subtree to flush + * + * Flushing is serialized by the underlying global rstat lock. There is also a + * minimum amount of work to be done even if there are no stat updates to flush. + * Hence, we only flush the stats if the updates delta exceeds a threshold. This + * avoids unnecessary work and contention on the underlying lock. + */ +void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { - if (memcg_should_flush_stats(root_mem_cgroup)) - do_flush_stats(); + if (mem_cgroup_disabled()) + return; + + if (!memcg) + memcg = root_mem_cgroup; + + if (memcg_should_flush_stats(memcg)) + do_flush_stats(memcg); } -void mem_cgroup_flush_stats_ratelimited(void) +void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { /* Only flush if the periodic flusher is one full cycle late */ if (time_after64(jiffies_64, READ_ONCE(flush_last_time) + 2*FLUSH_TIME)) - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); } static void flush_memcg_stats_dwork(struct work_struct *w) @@ -768,7 +772,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w) * Deliberately ignore memcg_should_flush_stats() here so that flushing * in latency-sensitive paths is as cheap as possible. 
*/ - do_flush_stats(); + do_flush_stats(root_mem_cgroup); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } @@ -1643,7 +1647,7 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) * * Current memory state: */ - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -4193,7 +4197,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, @@ -4274,7 +4278,7 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4770,7 +4774,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -6865,7 +6869,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; @@ -8096,7 +8100,11 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) break; } - cgroup_rstat_flush(memcg->css.cgroup); + /* + * mem_cgroup_flush_stats() ignores small changes. Use + * do_flush_stats() directly to get accurate stats for charging. 
+ */ + do_flush_stats(memcg); pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE; if (pages < max) continue; @@ -8161,8 +8169,10 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) static u64 zswap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { - cgroup_rstat_flush(css->cgroup); - return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + mem_cgroup_flush_stats(memcg); + return memcg_page_state(memcg, MEMCG_ZSWAP_B); } static int zswap_max_show(struct seq_file *m, void *v) diff --git a/mm/vmscan.c b/mm/vmscan.c index f0eba9ef3332..b4ca3563bcf4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2226,7 +2226,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) * Flush the memory cgroup stats, so that we read accurate per-memcg * lruvec stats for heuristics. */ - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(sc->target_mem_cgroup); /* * Determine the scan balance between anon and file LRUs. diff --git a/mm/workingset.c b/mm/workingset.c index 6b9871f5a2e8..2a2a34234df9 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -464,8 +464,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) rcu_read_unlock(); - /* Flush stats (and potentially sleep) outside the RCU read section */ - mem_cgroup_flush_stats_ratelimited(); + /* + * Flush stats (and potentially sleep) outside the RCU read section. + * XXX: With per-memcg flushing and thresholding, is ratelimiting + * still needed here? 
+ */ + mem_cgroup_flush_stats_ratelimited(eviction_memcg); eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -676,7 +680,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct lruvec *lruvec; int i; - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, diff --git a/mm/zswap.c b/mm/zswap.c index 015425ed9003..ac31fec176e9 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -641,7 +641,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, return 0; #ifdef CONFIG_MEMCG_KMEM - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats(memcg); nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); #else -- cgit v1.2.3 From 3485b88390b0af9e05dc2c3f57e9936f41e159a0 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:04 +0000 Subject: mm: thp: introduce multi-size THP sysfs interface In preparation for adding support for anonymous multi-size THP, introduce new sysfs structure that will be used to control the new behaviours. A new directory is added under transparent_hugepage for each supported THP size, and contains an `enabled` file, which can be set to "inherit" (to inherit the global setting), "always", "madvise" or "never". For now, the kernel still only supports PMD-sized anonymous THP, so only 1 directory is populated. The first half of the change converts transhuge_vma_suitable() and hugepage_vma_check() so that they take a bitfield of orders for which the user wants to determine support, and the functions filter out all the orders that can't be supported, given the current sysfs configuration and the VMA dimensions. The resulting functions are renamed to thp_vma_suitable_orders() and thp_vma_allowable_orders() respectively. 
Convenience functions that take a single, unencoded order and return a boolean are also defined as thp_vma_suitable_order() and thp_vma_allowable_order(). The second half of the change implements the new sysfs interface. It has been done so that each supported THP size has a `struct thpsize`, which describes the relevant metadata and is itself a kobject. This is pretty minimal for now, but should make it easy to add new per-thpsize files to the interface if needed in future (e.g. per-size defrag). Rather than keep the `enabled` state directly in the struct thpsize, I've elected to directly encode it into huge_anon_orders_[always|madvise|inherit] bitfields since this reduces the amount of work required in thp_vma_allowable_orders() which is called for every page fault. See Documentation/admin-guide/mm/transhuge.rst, as modified by this commit, for details of how the new sysfs interface works. [ryan.roberts@arm.com: fix build warning when CONFIG_SYSFS is disabled] Link: https://lkml.kernel.org/r/20231211125320.3997543-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20231207161211.2374093-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Barry Song Tested-by: Kefeng Wang Tested-by: John Hubbard Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 97 +++++++++--- Documentation/filesystems/proc.rst | 6 +- fs/proc/task_mmu.c | 3 +- include/linux/huge_mm.h | 181 ++++++++++++++++++---- mm/huge_memory.c | 231 +++++++++++++++++++++++++---- mm/khugepaged.c | 20 ++- mm/memory.c | 6 +- mm/page_vma_mapped.c | 3 +- 8 files changed, 459 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index b0cc8243e093..04eb45a2f940 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -45,10 +45,25 @@ components: the two is using hugepages just because of the fact the TLB miss is going to run faster. +Modern kernels support "multi-size THP" (mTHP), which introduces the +ability to allocate memory in blocks that are bigger than a base page +but smaller than traditional PMD-size (as described above), in +increments of a power-of-2 number of pages. mTHP can back anonymous +memory (for example 16K, 32K, 64K, etc). These THPs continue to be +PTE-mapped, but in many cases can still provide similar benefits to +those outlined above: Page faults are significantly reduced (by a +factor of e.g. 4, 8, 16, etc), but latency spikes are much less +prominent because the size of each page isn't as huge as the PMD-sized +variant and there is less memory to clear in each page fault. Some +architectures also employ TLB compression mechanisms to squeeze more +entries in when a set of PTEs are virtually and physically contiguous +and appropriately aligned. In this case, TLB misses will occur less +often. + THP can be enabled system wide or restricted to certain tasks or even memory ranges inside task's address space.
Unless THP is completely disabled, there is ``khugepaged`` daemon that scans memory and -collapses sequences of basic pages into huge pages. +collapses sequences of basic pages into PMD-sized huge pages. The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>` interface and using madvise(2) and prctl(2) system calls. @@ -95,12 +110,40 @@ Global THP controls Transparent Hugepage Support for anonymous memory can be entirely disabled (mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to avoid the risk of consuming more memory resources) or enabled -system wide. This can be achieved with one of:: +system wide. This can be achieved per-supported-THP-size with one of:: + + echo always >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled + echo madvise >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled + echo never >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled + +where <size> is the hugepage size being addressed, the available sizes +for which vary by system. + +For example:: + + echo always >/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled + +Alternatively it is possible to specify that a given hugepage size +will inherit the top-level "enabled" value:: + + echo inherit >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled + +For example:: + + echo inherit >/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled + +The top-level setting (for use with "inherit") can be set by issuing +one of the following commands:: echo always >/sys/kernel/mm/transparent_hugepage/enabled echo madvise >/sys/kernel/mm/transparent_hugepage/enabled echo never >/sys/kernel/mm/transparent_hugepage/enabled +By default, PMD-sized hugepages have enabled="inherit" and all other +hugepage sizes have enabled="never". If enabling multiple hugepage +sizes, the kernel will select the most appropriate enabled size for a +given allocation.
+ It's also possible to limit defrag efforts in the VM to generate anonymous hugepages in case they're not immediately free to madvise regions or to never try to defrag memory and simply fallback to regular @@ -146,25 +189,34 @@ madvise never should be self-explanatory. -By default kernel tries to use huge zero page on read page fault to -anonymous mapping. It's possible to disable huge zero page by writing 0 -or enable it back by writing 1:: +By default kernel tries to use huge, PMD-mappable zero page on read +page fault to anonymous mapping. It's possible to disable huge zero +page by writing 0 or enable it back by writing 1:: echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page -Some userspace (such as a test program, or an optimized memory allocation -library) may want to know the size (in bytes) of a transparent hugepage:: +Some userspace (such as a test program, or an optimized memory +allocation library) may want to know the size (in bytes) of a +PMD-mappable transparent hugepage:: cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size -khugepaged will be automatically started when -transparent_hugepage/enabled is set to "always" or "madvise, and it'll -be automatically shutdown if it's set to "never". +khugepaged will be automatically started when one or more hugepage +sizes are enabled (either by directly setting "always" or "madvise", +or by setting "inherit" while the top-level enabled is set to "always" +or "madvise"), and it'll be automatically shutdown when the last +hugepage size is disabled (either by directly setting "never", or by +setting "inherit" while the top-level enabled is set to "never"). Khugepaged controls ------------------- +.. note:: + khugepaged currently only searches for opportunities to collapse to + PMD-sized THP and no attempt is made to collapse to other THP + sizes. 
+ khugepaged runs usually at low frequency so while one may not want to invoke defrag algorithms synchronously during the page faults, it should be worth invoking defrag at least in khugepaged. However it's @@ -282,19 +334,26 @@ force Need of application restart =========================== -The transparent_hugepage/enabled values and tmpfs mount option only affect -future behavior. So to make them effective you need to restart any -application that could have been using hugepages. This also applies to the -regions registered in khugepaged. +The transparent_hugepage/enabled and +transparent_hugepage/hugepages-<size>kB/enabled values and tmpfs mount +option only affect future behavior. So to make them effective you need +to restart any application that could have been using hugepages. This +also applies to the regions registered in khugepaged. Monitoring usage ================ -The number of anonymous transparent huge pages currently used by the +.. note:: + Currently the below counters only record events relating to + PMD-sized THP. Events relating to other THP sizes are not included. + +The number of PMD-sized anonymous transparent huge pages currently used by the system is available by reading the AnonHugePages field in ``/proc/meminfo``. -To identify what applications are using anonymous transparent huge pages, -it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields -for each mapping. +To identify what applications are using PMD-sized anonymous transparent huge +pages, it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages +fields for each mapping. (Note that AnonHugePages only applies to traditional +PMD-sized THP for historical reasons and should have been called +AnonHugePmdMapped). The number of file transparent huge pages mapped to userspace is available by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``. @@ -413,7 +472,7 @@ for huge pages.
Optimizing the applications =========================== -To be guaranteed that the kernel will map a 2M page immediately in any +To be guaranteed that the kernel will map a THP immediately in any memory region, the mmap region has to be hugepage naturally aligned. posix_memalign() can provide that guarantee. diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 49ef12df631b..104c6d047d9b 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -528,9 +528,9 @@ replaced by copy-on-write) part of the underlying shmem object out on swap. does not take into account swapped out page of underlying shmem objects. "Locked" indicates whether the mapping is locked in memory or not. -"THPeligible" indicates whether the mapping is eligible for allocating THP -pages as well as the THP is PMD mappable or not - 1 if true, 0 otherwise. -It just shows the current status. +"THPeligible" indicates whether the mapping is eligible for allocating +naturally aligned THP pages of any currently enabled size. 1 if true, 0 +otherwise. "VmFlags" field deserves a separate description. 
This member represents the kernel flags associated with the particular virtual memory area in two letter diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d19924bf0a39..79855e1c5b57 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -865,7 +865,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - hugepage_vma_check(vma, vma->vm_flags, true, false, true)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false, + true, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fa0350b0812a..609c153bae57 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -67,6 +67,24 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR)) + hpage_size >> PAGE_SHIFT)) return false; } - haddr = addr & HPAGE_PMD_MASK; + haddr = ALIGN_DOWN(addr, hpage_size); - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + if (haddr < vma->vm_start || haddr + hpage_size > vma->vm_end) return false; return true; } +/* + * Filter the bitfield of input orders to the ones suitable for use in the vma. + * See thp_vma_suitable_order(). + * All orders that pass the checks are returned as a bitfield. + */ +static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, + unsigned long addr, unsigned long orders) +{ + int order; + + /* + * Iterate over orders, highest to lowest, removing orders that don't + * meet alignment requirements from the set. Exit loop at first order + * that meets requirements, since all lower orders must also meet + * requirements. 
+ */ + + order = highest_order(orders); + + while (orders) { + if (thp_vma_suitable_order(vma, addr, order)) + break; + order = next_order(&orders, order); + } + + return orders; +} + static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; @@ -130,8 +208,52 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps, bool in_pf, bool enforce_sysfs); +unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders); + +/** + * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma + * @vma: the vm area to check + * @vm_flags: use these vm_flags instead of vma->vm_flags + * @smaps: whether answer will be used for smaps file + * @in_pf: whether answer will be used by page fault handler + * @enforce_sysfs: whether sysfs config should be taken into account + * @orders: bitfield of all orders to consider + * + * Calculates the intersection of the requested hugepage orders and the allowed + * hugepage orders for the provided vma. Permitted orders are encoded as a set + * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3 + * corresponds to order-3, etc). Order-0 is never considered a hugepage order. + * + * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage + * orders are allowed. + */ +static inline +unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + /* Optimization to check if required orders are enabled early. 
*/ + if (enforce_sysfs && vma_is_anonymous(vma)) { + unsigned long mask = READ_ONCE(huge_anon_orders_always); + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_anon_orders_madvise); + if (hugepage_global_always() || + ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled())) + mask |= READ_ONCE(huge_anon_orders_inherit); + + orders &= mask; + if (!orders) + return 0; + } + + return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, + enforce_sysfs, orders); +} #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -267,17 +389,24 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) return false; } -static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long addr) +static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, + unsigned long addr, int order) { return false; } -static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs) +static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, + unsigned long addr, unsigned long orders) { - return false; + return 0; +} + +static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + return 0; } static inline void folio_prep_large_rmappable(struct folio *folio) {} diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c848ea97ab02..387b030c7f15 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -74,12 +74,23 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; +unsigned long huge_anon_orders_always __read_mostly; +unsigned long huge_anon_orders_madvise __read_mostly; +unsigned long huge_anon_orders_inherit __read_mostly; + +unsigned long 
__thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + /* Check the intersection of requested and supported orders. */ + orders &= vma_is_anonymous(vma) ? + THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE; + if (!orders) + return 0; -bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps, bool in_pf, bool enforce_sysfs) -{ if (!vma->vm_mm) /* vdso */ - return false; + return 0; /* * Explicitly disabled through madvise or prctl, or some @@ -88,16 +99,16 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * */ if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; + return 0; /* * If the hardware/firmware marked hugepage support disabled. */ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) - return false; + return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ if (vma_is_dax(vma)) - return in_pf; + return in_pf ? orders : 0; /* * khugepaged special VMA and hugetlb VMA. @@ -105,17 +116,29 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * VM_MIXEDMAP set. */ if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) - return false; + return 0; /* - * Check alignment for file vma and size for both file and anon vma. + * Check alignment for file vma and size for both file and anon vma by + * filtering out the unsuitable orders. * * Skip the check for page fault. Huge fault does the check in fault - * handlers. And this check is not suitable for huge PUD fault. + * handlers. 
*/ - if (!in_pf && - !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) - return false; + if (!in_pf) { + int order = highest_order(orders); + unsigned long addr; + + while (orders) { + addr = vma->vm_end - (PAGE_SIZE << order); + if (thp_vma_suitable_order(vma, addr, order)) + break; + order = next_order(&orders, order); + } + + if (!orders) + return 0; + } /* * Enabled via shmem mount options or sysfs settings. @@ -124,29 +147,33 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, */ if (!in_pf && shmem_file(vma->vm_file)) return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, - !enforce_sysfs, vma->vm_mm, vm_flags); - - /* Enforce sysfs THP requirements as necessary */ - if (enforce_sysfs && - (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && - !hugepage_flags_always()))) - return false; + !enforce_sysfs, vma->vm_mm, vm_flags) + ? orders : 0; if (!vma_is_anonymous(vma)) { + /* + * Enforce sysfs THP requirements as necessary. Anonymous vmas + * were already handled in thp_vma_allowable_orders(). + */ + if (enforce_sysfs && + (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_global_always()))) + return 0; + /* * Trust that ->huge_fault() handlers know what they are doing * in fault path. */ if (((in_pf || smaps)) && vma->vm_ops->huge_fault) - return true; + return orders; /* Only regular file is valid in collapse path */ if (((!in_pf || smaps)) && file_thp_enabled(vma)) - return true; - return false; + return orders; + return 0; } if (vma_is_temporary_stack(vma)) - return false; + return 0; /* * THPeligible bit of smaps should show 1 for proper VMAs even @@ -156,9 +183,9 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * the first page fault. */ if (!vma->anon_vma) - return (smaps || in_pf); + return (smaps || in_pf) ? 
orders : 0; - return true; + return orders; } static bool get_huge_zero_page(void) @@ -412,9 +439,136 @@ static const struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, }; +static void hugepage_exit_sysfs(struct kobject *hugepage_kobj); +static void thpsize_release(struct kobject *kobj); +static DEFINE_SPINLOCK(huge_anon_orders_lock); +static LIST_HEAD(thpsize_list); + +struct thpsize { + struct kobject kobj; + struct list_head node; + int order; +}; + +#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) + +static ssize_t thpsize_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_anon_orders_always)) + output = "[always] inherit madvise never"; + else if (test_bit(order, &huge_anon_orders_inherit)) + output = "always [inherit] madvise never"; + else if (test_bit(order, &huge_anon_orders_madvise)) + output = "always inherit [madvise] never"; + else + output = "always inherit madvise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t thpsize_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_inherit); + clear_bit(order, &huge_anon_orders_madvise); + set_bit(order, &huge_anon_orders_always); + spin_unlock(&huge_anon_orders_lock); + } else if (sysfs_streq(buf, "inherit")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_always); + clear_bit(order, &huge_anon_orders_madvise); + set_bit(order, &huge_anon_orders_inherit); + spin_unlock(&huge_anon_orders_lock); + } else if (sysfs_streq(buf, "madvise")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_always); + clear_bit(order, &huge_anon_orders_inherit); + set_bit(order, 
&huge_anon_orders_madvise); + spin_unlock(&huge_anon_orders_lock); + } else if (sysfs_streq(buf, "never")) { + spin_lock(&huge_anon_orders_lock); + clear_bit(order, &huge_anon_orders_always); + clear_bit(order, &huge_anon_orders_inherit); + clear_bit(order, &huge_anon_orders_madvise); + spin_unlock(&huge_anon_orders_lock); + } else + ret = -EINVAL; + + return ret; +} + +static struct kobj_attribute thpsize_enabled_attr = + __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store); + +static struct attribute *thpsize_attrs[] = { + &thpsize_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group thpsize_attr_group = { + .attrs = thpsize_attrs, +}; + +static const struct kobj_type thpsize_ktype = { + .release = &thpsize_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + +static struct thpsize *thpsize_create(int order, struct kobject *parent) +{ + unsigned long size = (PAGE_SIZE << order) / SZ_1K; + struct thpsize *thpsize; + int ret; + + thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL); + if (!thpsize) + return ERR_PTR(-ENOMEM); + + ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, + "hugepages-%lukB", size); + if (ret) { + kfree(thpsize); + return ERR_PTR(ret); + } + + ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group); + if (ret) { + kobject_put(&thpsize->kobj); + return ERR_PTR(ret); + } + + thpsize->order = order; + return thpsize; +} + +static void thpsize_release(struct kobject *kobj) +{ + kfree(to_thpsize(kobj)); +} + static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; + struct thpsize *thpsize; + unsigned long orders; + int order; + + /* + * Default to setting PMD-sized THP to inherit the global setting and + * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time + * constant so we have to do this here. 
+ */ + huge_anon_orders_inherit = BIT(PMD_ORDER); *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { @@ -434,8 +588,24 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) goto remove_hp_group; } + orders = THP_ORDERS_ALL_ANON; + order = highest_order(orders); + while (orders) { + thpsize = thpsize_create(order, *hugepage_kobj); + if (IS_ERR(thpsize)) { + pr_err("failed to create thpsize for order %d\n", order); + err = PTR_ERR(thpsize); + goto remove_all; + } + list_add(&thpsize->node, &thpsize_list); + order = next_order(&orders, order); + } + return 0; +remove_all: + hugepage_exit_sysfs(*hugepage_kobj); + return err; remove_hp_group: sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); delete_obj: @@ -445,6 +615,13 @@ delete_obj: static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) { + struct thpsize *thpsize, *tmp; + + list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) { + list_del(&thpsize->node); + kobject_put(&thpsize->kobj); + } + sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); kobject_put(hugepage_kobj); @@ -811,7 +988,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - if (!transhuge_vma_suitable(vma, haddr)) + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 064654717843..d72aecd3624a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -446,7 +446,8 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false, false, true)) + if (thp_vma_allowable_order(vma, vm_flags, false, false, true, + PMD_ORDER)) __khugepaged_enter(vma->vm_mm); 
} } @@ -922,16 +923,16 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!vma) return SCAN_VMA_NULL; - if (!transhuge_vma_suitable(vma, address)) + if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, - cc->is_khugepaged)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, + cc->is_khugepaged, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then * remapped to file after khugepaged reaquired the mmap_lock. * - * hugepage_vma_check may return true for qualified file + * thp_vma_allowable_order may return true for qualified file * vmas. */ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) @@ -1503,7 +1504,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here. */ - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, + PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2368,7 +2370,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, + true, PMD_ORDER)) { skip: progress++; continue; @@ -2705,7 +2708,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, + PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/memory.c b/mm/memory.c index 99582b188ed2..8ab2d994d997 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ 
-4322,7 +4322,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; - if (!transhuge_vma_suitable(vma, haddr)) + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; page = compound_head(page); @@ -5116,7 +5116,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - hugepage_vma_check(vma, vm_flags, false, true, true)) { + thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5150,7 +5150,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - hugepage_vma_check(vma, vm_flags, false, true, true)) { + thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e0b368e545ed..74d2de15fb5e 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -268,7 +268,8 @@ restart: * cleared *pmd but not decremented compound_mapcount(). */ if ((pvmw->flags & PVMW_SYNC) && - transhuge_vma_suitable(vma, pvmw->address) && + thp_vma_suitable_order(vma, pvmw->address, + PMD_ORDER) && (pvmw->nr_pages >= HPAGE_PMD_NR)) { spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); -- cgit v1.2.3 From 19eaf44954df64f9bc8dec398219e15ad0811497 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:05 +0000 Subject: mm: thp: support allocation of anonymous multi-size THP Introduce the logic to allow THP to be configured (through the new sysfs interface we just added) to allocate large folios to back anonymous memory, which are larger than the base page size but smaller than PMD-size. We call this new THP extension "multi-size THP" (mTHP). 
mTHP continues to be PTE-mapped, but in many cases can still provide similar benefits to traditional PMD-sized THP: Page faults are significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on the configured order), but latency spikes are much less prominent because the size of each page isn't as huge as the PMD-sized variant and there is less memory to clear in each page fault. The number of per-page operations (e.g. ref counting, rmap management, lru list management) is also significantly reduced since those ops now become per-folio. Some architectures also employ TLB compression mechanisms to squeeze more entries in when a set of PTEs are virtually and physically contiguous and appropriately aligned. In this case, TLB misses will occur less often. The new behaviour is disabled by default, but can be enabled at runtime by writing to /sys/kernel/mm/transparent_hugepage/hugepages-XXkB/enabled (see documentation in previous commit). The long term aim is to change the default to include suitable lower orders, but there are some risks around internal fragmentation that need to be better understood first. [ryan.roberts@arm.com: resolve some multi-size THP review nits] Link: https://lkml.kernel.org/r/20231214160251.3574571-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Kefeng Wang Tested-by: John Hubbard Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A.
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ++- mm/memory.c | 109 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 104 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 609c153bae57..fa7a38a30fc6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_NR (1<vma; + unsigned long orders; + struct folio *folio; + unsigned long addr; + pte_t *pte; + gfp_t gfp; + int order; + + /* + * If uffd is active for the vma we need per-page fault fidelity to + * maintain the uffd semantics. + */ + if (unlikely(userfaultfd_armed(vma))) + goto fallback; + + /* + * Get a list of all the (large) orders below PMD_ORDER that are enabled + * for this vma. Then filter out the orders that can't be allocated over + * the faulting address and still be fully contained in the vma. + */ + orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true, + BIT(PMD_ORDER) - 1); + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + + if (!orders) + goto fallback; + + pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK); + if (!pte) + return ERR_PTR(-EAGAIN); + + /* + * Find the highest order where the aligned range is completely + * pte_none(). Note that all remaining orders will be completely + * pte_none(). + */ + order = highest_order(orders); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + if (pte_range_none(pte + pte_index(addr), 1 << order)) + break; + order = next_order(&orders, order); + } + + pte_unmap(pte); + + /* Try allocating the highest of the remaining orders. 
*/ + gfp = vma_thp_gfp_mask(vma); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + folio = vma_alloc_folio(gfp, order, vma, addr, true); + if (folio) { + clear_huge_page(&folio->page, vmf->address, 1 << order); + return folio; + } + order = next_order(&orders, order); + } + +fallback: +#endif + return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address); +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -4134,9 +4212,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; + int nr_pages = 1; pte_t entry; + int i; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) @@ -4176,10 +4257,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); + /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ + folio = alloc_anon_folio(vmf); + if (IS_ERR(folio)) + return 0; if (!folio) goto oom; + nr_pages = folio_nr_pages(folio); + addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; folio_throttle_swaprate(folio, GFP_KERNEL); @@ -4196,12 +4283,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; - if (vmf_pte_changed(vmf)) { - update_mmu_tlb(vma, vmf->address, vmf->pte); + if (nr_pages == 1 && vmf_pte_changed(vmf)) { + update_mmu_tlb(vma, addr, vmf->pte); + goto 
release; + } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { + for (i = 0; i < nr_pages; i++) + update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i); goto release; } @@ -4216,16 +4306,17 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_MISSING); } - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_ref_add(folio, nr_pages - 1); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); + folio_add_new_anon_rmap(folio, vma, addr); folio_add_lru_vma(folio, vma); setpte: if (uffd_wp) entry = pte_mkuffd_wp(entry); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); + set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); /* No need to invalidate - it was non-present before */ - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); + update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); -- cgit v1.2.3 From 6ad59a3838cd0a8536721e60b8e4fbe5fdeb233a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 13 Dec 2023 19:03:33 +0000 Subject: mm/damon: update email of SeongJae Patch series "mm/damon: misc updates for 6.8". Update comments, tests, and documents for DAMON. This patch (of 6): SeongJae is using his kernel.org account for DAMON development. Update the old email addresses on the comments of DAMON source files. 
Link: https://lkml.kernel.org/r/20231213190338.54146-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231213190338.54146-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- mm/damon/core-test.h | 2 +- mm/damon/core.c | 2 +- mm/damon/dbgfs-test.h | 2 +- mm/damon/dbgfs.c | 2 +- mm/damon/modules-common.c | 2 +- mm/damon/vaddr-test.h | 2 +- mm/damon/vaddr.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 12510d8c51c6..5881e4ac30be 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -2,7 +2,7 @@ /* * DAMON api * - * Author: SeongJae Park + * Author: SeongJae Park */ #ifndef _DAMON_H_ diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 6e5e9502d648..876e398557b0 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -4,7 +4,7 @@ * * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. 
* - * Author: SeongJae Park + * Author: SeongJae Park */ #ifdef CONFIG_DAMON_KUNIT_TEST diff --git a/mm/damon/core.c b/mm/damon/core.c index 2c0cc65d041e..36f6f1d21ff0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2,7 +2,7 @@ /* * Data Access Monitor * - * Author: SeongJae Park + * Author: SeongJae Park */ #define pr_fmt(fmt) "damon: " fmt diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 0bb0d532b159..2d85217f5ba4 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -2,7 +2,7 @@ /* * DAMON Debugfs Interface Unit Tests * - * Author: SeongJae Park + * Author: SeongJae Park */ #ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index dc0ea1fc30ca..7dac24e69e3b 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -2,7 +2,7 @@ /* * DAMON Debugfs Interface * - * Author: SeongJae Park + * Author: SeongJae Park */ #define pr_fmt(fmt) "damon-dbgfs: " fmt diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c index b2381a8466ec..7cf96574cde7 100644 --- a/mm/damon/modules-common.c +++ b/mm/damon/modules-common.c @@ -2,7 +2,7 @@ /* * Common Primitives for DAMON Modules * - * Author: SeongJae Park + * Author: SeongJae Park */ #include diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index dcf1ca6b31cc..83626483f82b 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -4,7 +4,7 @@ * * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. 
* - * Author: SeongJae Park + * Author: SeongJae Park */ #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index a4d1f63c5b23..a67454b825dc 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -2,7 +2,7 @@ /* * DAMON Primitives for Virtual Address Spaces * - * Author: SeongJae Park + * Author: SeongJae Park */ #define pr_fmt(fmt) "damon-va: " fmt -- cgit v1.2.3 From 0abfa8efad8dccc3899f64dafa985a251714a709 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 12 Dec 2023 20:33:16 -0800 Subject: gfp: gfp_types.h: fix typos & punctuation Correct typos/spellos and punctuation. Link: https://lkml.kernel.org/r/20231213043316.10128-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index ae994534a12a..1b6053da8754 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -162,25 +162,25 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. * * The default allocator behavior depends on the request size. We have a concept - * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER). + * of so-called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER). * !costly allocations are too essential to fail so they are implicitly * non-failing by default (with some exceptions like OOM victims might fail so * the caller still has to check for failures) while costly requests try to be * not disruptive and back off even without invoking the OOM killer. * The following three modifiers might be used to override some of these - * implicit rules + * implicit rules. * * %__GFP_NORETRY: The VM implementation will try only very lightweight * memory direct reclaim to get some memory under memory pressure (thus * it can sleep).
It will avoid disruptive actions like OOM killer. The * caller must handle the failure which is quite likely to happen under * heavy memory pressure. The flag is suitable when failure can easily be - * handled at small cost, such as reduced throughput + * handled at small cost, such as reduced throughput. * * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim * procedures that have previously failed if there is some indication - * that progress has been made else where. It can wait for other - * tasks to attempt high level approaches to freeing memory such as + * that progress has been made elsewhere. It can wait for other + * tasks to attempt high-level approaches to freeing memory such as * compaction (which removes fragmentation) and page-out. * There is still a definite limit to the number of retries, but it is * a larger limit than with %__GFP_NORETRY. @@ -230,7 +230,7 @@ typedef unsigned int __bitwise gfp_t; * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting * memory tags at the same time as zeroing memory has minimal additional - * performace impact. + * performance impact. * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by -- cgit v1.2.3 From adef440691bab824e39c1b17382322d195e1fab0 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 6 Dec 2023 02:36:56 -0800 Subject: userfaultfd: UFFDIO_MOVE uABI Implement the uABI of UFFDIO_MOVE ioctl. UFFDIO_COPY performs ~20% better than UFFDIO_MOVE when the application needs pages to be allocated [1]. However, with UFFDIO_MOVE, if pages are available (in userspace) for recycling, as is usually the case in heap compaction algorithms, then we can avoid the page allocation and memcpy (done by UFFDIO_COPY). 
Also, since the pages are recycled in the userspace, we avoid the need to release (via madvise) the pages back to the kernel [2]. We see over 40% reduction (on a Google pixel 6 device) in the compacting thread's completion time by using UFFDIO_MOVE vs. UFFDIO_COPY. This was measured using a benchmark that emulates a heap compaction implementation using userfaultfd (to allow concurrent accesses by application threads). More details of the usecase are explained in [2]. Furthermore, UFFDIO_MOVE enables moving swapped-out pages without touching them within the same vma. Today, it can only be done by mremap, however it forces splitting the vma. [1] https://lore.kernel.org/all/1425575884-2574-1-git-send-email-aarcange@redhat.com/ [2] https://lore.kernel.org/linux-mm/CA+EESO4uO84SSnBhArH4HvLNhaUQ5nZKNKXqxRCyjniNVjp0Aw@mail.gmail.com/ Update for the ioctl_userfaultfd(2) manpage: UFFDIO_MOVE (Since Linux xxx) Move a continuous memory chunk into the userfault registered range and optionally wake up the blocked thread. The source and destination addresses and the number of bytes to move are specified by the src, dst, and len fields of the uffdio_move structure pointed to by argp: struct uffdio_move { __u64 dst; /* Destination of move */ __u64 src; /* Source of move */ __u64 len; /* Number of bytes to move */ __u64 mode; /* Flags controlling behavior of move */ __s64 move; /* Number of bytes moved, or negated error */ }; The following value may be bitwise ORed in mode to change the behavior of the UFFDIO_MOVE operation: UFFDIO_MOVE_MODE_DONTWAKE Do not wake up the thread that waits for page-fault resolution UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES Allow holes in the source virtual range that is being moved. When not specified, the holes will result in ENOENT error. When specified, the holes will be accounted as successfully moved memory. 
This is mostly useful to move hugepage-aligned virtual regions without knowing if there are transparent hugepages in the regions or not, but preventing the risk of having to split the hugepage during the operation. The move field is used by the kernel to return the number of bytes that were actually moved, or an error (a negated errno-style value). If the value returned in move doesn't match the value that was specified in len, the operation fails with the error EAGAIN. The move field is output-only; it is not read by the UFFDIO_MOVE operation. The operation may fail for various reasons. Usually, remapping of pages that are not exclusive to the given process fails; once KSM deduplicates pages or fork() COW-shares pages with child processes, they are no longer exclusive. Further, the kernel might only perform lightweight checks for detecting whether the pages are exclusive, and return -EBUSY in case that check fails. To make the operation more likely to succeed, KSM should be disabled, fork() should be avoided or MADV_DONTFORK should be configured for the source VMA before fork(). This ioctl(2) operation returns 0 on success. In this case, the entire area was moved. On error, -1 is returned and errno is set to indicate the error. Possible errors include: EAGAIN The number of bytes moved (i.e., the value returned in the move field) does not equal the value that was specified in the len field. EINVAL Either dst or len was not a multiple of the system page size, or the range specified by src and len or dst and len was invalid. EINVAL An invalid bit was specified in the mode field. ENOENT The source virtual memory range has unmapped holes and UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES is not set. EEXIST The destination virtual memory range is fully or partially mapped. EBUSY The pages in the source virtual memory range are either pinned or not exclusive to the process.
The kernel might only perform lightweight checks for detecting whether the pages are exclusive. To make the operation more likely to succeed, KSM should be disabled, fork() should be avoided or MADV_DONTFORK should be configured for the source virtual memory area before fork(). ENOMEM Allocating memory needed for the operation failed. ESRCH The target process has exited at the time of a UFFDIO_MOVE operation. Link: https://lkml.kernel.org/r/20231206103702.3873743-3-surenb@google.com Signed-off-by: Andrea Arcangeli Signed-off-by: Suren Baghdasaryan Cc: Al Viro Cc: Axel Rasmussen Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Shuah Khan Cc: ZhangPeng Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/userfaultfd.rst | 3 + fs/userfaultfd.c | 72 ++++ include/linux/rmap.h | 5 + include/linux/userfaultfd_k.h | 11 + include/uapi/linux/userfaultfd.h | 29 +- mm/huge_memory.c | 122 ++++++ mm/khugepaged.c | 3 + mm/rmap.c | 6 + mm/userfaultfd.c | 614 +++++++++++++++++++++++++++ 9 files changed, 864 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 203e26da5f92..e5cc8848dcb3 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -113,6 +113,9 @@ events, except page fault notifications, may be generated: areas. ``UFFD_FEATURE_MINOR_SHMEM`` is the analogous feature indicating support for shmem virtual memory areas. +- ``UFFD_FEATURE_MOVE`` indicates that the kernel supports moving an + existing page contents from userspace. 
+ The userland application should set the feature flags it intends to use when invoking the ``UFFDIO_API`` ioctl, to request that those features be enabled if supported. diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e8af40b05549..6e2a4d6a0d8f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2005,6 +2005,75 @@ static inline unsigned int uffd_ctx_features(__u64 user_features) return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; } +static int userfaultfd_move(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_move uffdio_move; + struct uffdio_move __user *user_uffdio_move; + struct userfaultfd_wake_range range; + struct mm_struct *mm = ctx->mm; + + user_uffdio_move = (struct uffdio_move __user *) arg; + + if (atomic_read(&ctx->mmap_changing)) + return -EAGAIN; + + if (copy_from_user(&uffdio_move, user_uffdio_move, + /* don't copy "move" last field */ + sizeof(uffdio_move)-sizeof(__s64))) + return -EFAULT; + + /* Do not allow cross-mm moves. 
*/ + if (mm != current->mm) + return -EINVAL; + + ret = validate_range(mm, uffdio_move.dst, uffdio_move.len); + if (ret) + return ret; + + ret = validate_range(mm, uffdio_move.src, uffdio_move.len); + if (ret) + return ret; + + if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES| + UFFDIO_MOVE_MODE_DONTWAKE)) + return -EINVAL; + + if (mmget_not_zero(mm)) { + mmap_read_lock(mm); + + /* Re-check after taking mmap_lock */ + if (likely(!atomic_read(&ctx->mmap_changing))) + ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, + uffdio_move.len, uffdio_move.mode); + else + ret = -EINVAL; + + mmap_read_unlock(mm); + mmput(mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_move->move))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + VM_WARN_ON(!ret); + range.len = ret; + if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) { + range.start = uffdio_move.dst; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_move.len ? 
0 : -EAGAIN; + +out: + return ret; +} + /* * userland asks for a certain API version and we return which bits * and ioctl commands are implemented in this kernel for such API @@ -2097,6 +2166,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_ZEROPAGE: ret = userfaultfd_zeropage(ctx, arg); break; + case UFFDIO_MOVE: + ret = userfaultfd_move(ctx, arg); + break; case UFFDIO_WRITEPROTECT: ret = userfaultfd_writeprotect(ctx, arg); break; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3c2fc291b071..af6a32b6f3e7 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -121,6 +121,11 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma) down_write(&anon_vma->root->rwsem); } +static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) +{ + return down_write_trylock(&anon_vma->root->rwsem); +} + static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) { up_write(&anon_vma->root->rwsem); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f2dc19f40d05..e4056547fbe6 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -93,6 +93,17 @@ extern int mwriteprotect_range(struct mm_struct *dst_mm, extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); +/* move_pages */ +void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); +void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); +ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, + unsigned long dst_start, unsigned long src_start, + unsigned long len, __u64 flags); +int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr); + /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) diff 
--git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 0dbc81015018..2841e4ea8f2c 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -41,7 +41,8 @@ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ UFFD_FEATURE_WP_UNPOPULATED | \ UFFD_FEATURE_POISON | \ - UFFD_FEATURE_WP_ASYNC) + UFFD_FEATURE_WP_ASYNC | \ + UFFD_FEATURE_MOVE) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -50,6 +51,7 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_MOVE | \ (__u64)1 << _UFFDIO_WRITEPROTECT | \ (__u64)1 << _UFFDIO_CONTINUE | \ (__u64)1 << _UFFDIO_POISON) @@ -73,6 +75,7 @@ #define _UFFDIO_WAKE (0x02) #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_MOVE (0x05) #define _UFFDIO_WRITEPROTECT (0x06) #define _UFFDIO_CONTINUE (0x07) #define _UFFDIO_POISON (0x08) @@ -92,6 +95,8 @@ struct uffdio_copy) #define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ struct uffdio_zeropage) +#define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \ + struct uffdio_move) #define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ struct uffdio_writeprotect) #define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ @@ -222,6 +227,9 @@ struct uffdio_api { * asynchronous mode is supported in which the write fault is * automatically resolved and write-protection is un-set. * It implies UFFD_FEATURE_WP_UNPOPULATED. + * + * UFFD_FEATURE_MOVE indicates that the kernel supports moving an + * existing page contents from userspace. 
*/ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -239,6 +247,7 @@ struct uffdio_api { #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) #define UFFD_FEATURE_POISON (1<<14) #define UFFD_FEATURE_WP_ASYNC (1<<15) +#define UFFD_FEATURE_MOVE (1<<16) __u64 features; __u64 ioctls; @@ -347,6 +356,24 @@ struct uffdio_poison { __s64 updated; }; +struct uffdio_move { + __u64 dst; + __u64 src; + __u64 len; + /* + * Especially if used to atomically remove memory from the + * address space the wake on the dst range is not needed. + */ +#define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1) + __u64 mode; + /* + * "move" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 move; +}; + /* * Flags for the userfaultfd(2) system call itself. */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 387b030c7f15..6be1a380a298 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2141,6 +2141,128 @@ unlock: return ret; } +#ifdef CONFIG_USERFAULTFD +/* + * The PT lock for src_pmd and the mmap_lock for reading are held by + * the caller, but it must return after releasing the page_table_lock. + * Just move the page from src_pmd to dst_pmd if possible. + * Return zero if succeeded in moving the page, -EAGAIN if it needs to be + * repeated by the caller, or other errors in case of failure. 
+ */ +int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr) +{ + pmd_t _dst_pmd, src_pmdval; + struct page *src_page; + struct folio *src_folio; + struct anon_vma *src_anon_vma; + spinlock_t *src_ptl, *dst_ptl; + pgtable_t src_pgtable; + struct mmu_notifier_range range; + int err = 0; + + src_pmdval = *src_pmd; + src_ptl = pmd_lockptr(mm, src_pmd); + + lockdep_assert_held(src_ptl); + mmap_assert_locked(mm); + + /* Sanity checks before the operation */ + if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) || + WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) { + spin_unlock(src_ptl); + return -EINVAL; + } + + if (!pmd_trans_huge(src_pmdval)) { + spin_unlock(src_ptl); + if (is_pmd_migration_entry(src_pmdval)) { + pmd_migration_entry_wait(mm, &src_pmdval); + return -EAGAIN; + } + return -ENOENT; + } + + src_page = pmd_page(src_pmdval); + if (unlikely(!PageAnonExclusive(src_page))) { + spin_unlock(src_ptl); + return -EBUSY; + } + + src_folio = page_folio(src_page); + folio_get(src_folio); + spin_unlock(src_ptl); + + flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr, + src_addr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + + folio_lock(src_folio); + + /* + * split_huge_page walks the anon_vma chain without the page + * lock. Serialize against it with the anon_vma lock, the page + * lock is not enough. 
+ */ + src_anon_vma = folio_get_anon_vma(src_folio); + if (!src_anon_vma) { + err = -EAGAIN; + goto unlock_folio; + } + anon_vma_lock_write(src_anon_vma); + + dst_ptl = pmd_lockptr(mm, dst_pmd); + double_pt_lock(src_ptl, dst_ptl); + if (unlikely(!pmd_same(*src_pmd, src_pmdval) || + !pmd_same(*dst_pmd, dst_pmdval))) { + err = -EAGAIN; + goto unlock_ptls; + } + if (folio_maybe_dma_pinned(src_folio) || + !PageAnonExclusive(&src_folio->page)) { + err = -EBUSY; + goto unlock_ptls; + } + + if (WARN_ON_ONCE(!folio_test_head(src_folio)) || + WARN_ON_ONCE(!folio_test_anon(src_folio))) { + err = -EBUSY; + goto unlock_ptls; + } + + folio_move_anon_rmap(src_folio, dst_vma); + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + + src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); + /* Folio got pinned from under us. Put it back and fail the move. */ + if (folio_maybe_dma_pinned(src_folio)) { + set_pmd_at(mm, src_addr, src_pmd, src_pmdval); + err = -EBUSY; + goto unlock_ptls; + } + + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); + /* Follow mremap() behavior and treat the entry dirty after the move */ + _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); + set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd); + + src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd); + pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable); +unlock_ptls: + double_pt_unlock(src_ptl, dst_ptl); + anon_vma_unlock_write(src_anon_vma); + put_anon_vma(src_anon_vma); +unlock_folio: + /* unblock rmap walks */ + folio_unlock(src_folio); + mmu_notifier_invalidate_range_end(&range); + folio_put(src_folio); + return err; +} +#endif /* CONFIG_USERFAULTFD */ + /* * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 
* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d72aecd3624a..de174d049e71 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1140,6 +1140,9 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * Prevent all access to pagetables with the exception of * gup_fast later handled by the ptep_clear_flush and the VM * handled by the anon_vma lock + PG_lock. + * + * UFFDIO_MOVE is prevented to race as well thanks to the + * mmap_lock. */ mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, true, &vma, cc); diff --git a/mm/rmap.c b/mm/rmap.c index 15a55304aa3b..846fc79f3ca9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -490,6 +490,12 @@ void __init anon_vma_init(void) * page_remove_rmap() that the anon_vma pointer from page->mapping is valid * if there is a mapcount, we can dereference the anon_vma after observing * those. + * + * NOTE: the caller should normally hold folio lock when calling this. If + * not, the caller needs to double check the anon_vma didn't change after + * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it + * concurrently without folio lock protection). See folio_lock_anon_vma_read() + * which has already covered that, and comment above remap_pages(). 
*/ struct anon_vma *folio_get_anon_vma(struct folio *folio) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0b6ca553bebe..9ec814e47e99 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -842,3 +842,617 @@ out_unlock: mmap_read_unlock(dst_mm); return err; } + + +void double_pt_lock(spinlock_t *ptl1, + spinlock_t *ptl2) + __acquires(ptl1) + __acquires(ptl2) +{ + spinlock_t *ptl_tmp; + + if (ptl1 > ptl2) { + /* exchange ptl1 and ptl2 */ + ptl_tmp = ptl1; + ptl1 = ptl2; + ptl2 = ptl_tmp; + } + /* lock in virtual address order to avoid lock inversion */ + spin_lock(ptl1); + if (ptl1 != ptl2) + spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING); + else + __acquire(ptl2); +} + +void double_pt_unlock(spinlock_t *ptl1, + spinlock_t *ptl2) + __releases(ptl1) + __releases(ptl2) +{ + spin_unlock(ptl1); + if (ptl1 != ptl2) + spin_unlock(ptl2); + else + __release(ptl2); +} + + +static int move_present_pte(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio *src_folio) +{ + int err = 0; + + double_pt_lock(dst_ptl, src_ptl); + + if (!pte_same(*src_pte, orig_src_pte) || + !pte_same(*dst_pte, orig_dst_pte)) { + err = -EAGAIN; + goto out; + } + if (folio_test_large(src_folio) || + folio_maybe_dma_pinned(src_folio) || + !PageAnonExclusive(&src_folio->page)) { + err = -EBUSY; + goto out; + } + + folio_move_anon_rmap(src_folio, dst_vma); + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + + orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); + /* Folio got pinned from under us. Put it back and fail the move. 
*/ + if (folio_maybe_dma_pinned(src_folio)) { + set_pte_at(mm, src_addr, src_pte, orig_src_pte); + err = -EBUSY; + goto out; + } + + orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); + /* Follow mremap() behavior and treat the entry dirty after the move */ + orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); + + set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); +out: + double_pt_unlock(dst_ptl, src_ptl); + return err; +} + +static int move_swap_pte(struct mm_struct *mm, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + spinlock_t *dst_ptl, spinlock_t *src_ptl) +{ + if (!pte_swp_exclusive(orig_src_pte)) + return -EBUSY; + + double_pt_lock(dst_ptl, src_ptl); + + if (!pte_same(*src_pte, orig_src_pte) || + !pte_same(*dst_pte, orig_dst_pte)) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } + + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); + set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); + double_pt_unlock(dst_ptl, src_ptl); + + return 0; +} + +/* + * The mmap_lock for reading is held by the caller. Just move the page + * from src_pmd to dst_pmd if possible, and return true if succeeded + * in moving the page. 
+ */ +static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + __u64 mode) +{ + swp_entry_t entry; + pte_t orig_src_pte, orig_dst_pte; + pte_t src_folio_pte; + spinlock_t *src_ptl, *dst_ptl; + pte_t *src_pte = NULL; + pte_t *dst_pte = NULL; + + struct folio *src_folio = NULL; + struct anon_vma *src_anon_vma = NULL; + struct mmu_notifier_range range; + int err = 0; + + flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + src_addr, src_addr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); +retry: + dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl); + + /* Retry if a huge pmd materialized from under us */ + if (unlikely(!dst_pte)) { + err = -EAGAIN; + goto out; + } + + src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl); + + /* + * We held the mmap_lock for reading so MADV_DONTNEED + * can zap transparent huge pages under us, or the + * transparent huge page fault can establish new + * transparent huge pages under us. 
+ */ + if (unlikely(!src_pte)) { + err = -EAGAIN; + goto out; + } + + /* Sanity checks before the operation */ + if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) || + WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) { + err = -EINVAL; + goto out; + } + + spin_lock(dst_ptl); + orig_dst_pte = *dst_pte; + spin_unlock(dst_ptl); + if (!pte_none(orig_dst_pte)) { + err = -EEXIST; + goto out; + } + + spin_lock(src_ptl); + orig_src_pte = *src_pte; + spin_unlock(src_ptl); + if (pte_none(orig_src_pte)) { + if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) + err = -ENOENT; + else /* nothing to do to move a hole */ + err = 0; + goto out; + } + + /* If PTE changed after we locked the folio them start over */ + if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { + err = -EAGAIN; + goto out; + } + + if (pte_present(orig_src_pte)) { + /* + * Pin and lock both source folio and anon_vma. Since we are in + * RCU read section, we can't block, so on contention have to + * unmap the ptes, obtain the lock and retry. 
+ */ + if (!src_folio) { + struct folio *folio; + + /* + * Pin the page while holding the lock to be sure the + * page isn't freed under us + */ + spin_lock(src_ptl); + if (!pte_same(orig_src_pte, *src_pte)) { + spin_unlock(src_ptl); + err = -EAGAIN; + goto out; + } + + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); + if (!folio || !PageAnonExclusive(&folio->page)) { + spin_unlock(src_ptl); + err = -EBUSY; + goto out; + } + + folio_get(folio); + src_folio = folio; + src_folio_pte = orig_src_pte; + spin_unlock(src_ptl); + + if (!folio_trylock(src_folio)) { + pte_unmap(&orig_src_pte); + pte_unmap(&orig_dst_pte); + src_pte = dst_pte = NULL; + /* now we can block and wait */ + folio_lock(src_folio); + goto retry; + } + + if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { + err = -EBUSY; + goto out; + } + } + + /* at this point we have src_folio locked */ + if (folio_test_large(src_folio)) { + err = split_folio(src_folio); + if (err) + goto out; + } + + if (!src_anon_vma) { + /* + * folio_referenced walks the anon_vma chain + * without the folio lock. Serialize against it with + * the anon_vma lock, the folio lock is not enough. 
+ */ + src_anon_vma = folio_get_anon_vma(src_folio); + if (!src_anon_vma) { + /* page was unmapped from under us */ + err = -EAGAIN; + goto out; + } + if (!anon_vma_trylock_write(src_anon_vma)) { + pte_unmap(&orig_src_pte); + pte_unmap(&orig_dst_pte); + src_pte = dst_pte = NULL; + /* now we can block and wait */ + anon_vma_lock_write(src_anon_vma); + goto retry; + } + } + + err = move_present_pte(mm, dst_vma, src_vma, + dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, + dst_ptl, src_ptl, src_folio); + } else { + entry = pte_to_swp_entry(orig_src_pte); + if (non_swap_entry(entry)) { + if (is_migration_entry(entry)) { + pte_unmap(&orig_src_pte); + pte_unmap(&orig_dst_pte); + src_pte = dst_pte = NULL; + migration_entry_wait(mm, src_pmd, src_addr); + err = -EAGAIN; + } else + err = -EFAULT; + goto out; + } + + err = move_swap_pte(mm, dst_addr, src_addr, + dst_pte, src_pte, + orig_dst_pte, orig_src_pte, + dst_ptl, src_ptl); + } + +out: + if (src_anon_vma) { + anon_vma_unlock_write(src_anon_vma); + put_anon_vma(src_anon_vma); + } + if (src_folio) { + folio_unlock(src_folio); + folio_put(src_folio); + } + if (dst_pte) + pte_unmap(dst_pte); + if (src_pte) + pte_unmap(src_pte); + mmu_notifier_invalidate_range_end(&range); + + return err; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline bool move_splits_huge_pmd(unsigned long dst_addr, + unsigned long src_addr, + unsigned long src_end) +{ + return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || + src_end - src_addr < HPAGE_PMD_SIZE; +} +#else +static inline bool move_splits_huge_pmd(unsigned long dst_addr, + unsigned long src_addr, + unsigned long src_end) +{ + /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */ + return false; +} +#endif + +static inline bool vma_move_compatible(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB | + VM_MIXEDMAP | VM_SHADOW_STACK)); +} + +static int validate_move_areas(struct 
userfaultfd_ctx *ctx, + struct vm_area_struct *src_vma, + struct vm_area_struct *dst_vma) +{ + /* Only allow moving if both have the same access and protection */ + if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) || + pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot)) + return -EINVAL; + + /* Only allow moving if both are mlocked or both aren't */ + if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED)) + return -EINVAL; + + /* + * For now, we keep it simple and only move between writable VMAs. + * Access flags are equal, therefore checking only the source is enough. + */ + if (!(src_vma->vm_flags & VM_WRITE)) + return -EINVAL; + + /* Check if vma flags indicate content which can be moved */ + if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) + return -EINVAL; + + /* Ensure dst_vma is registered in uffd we are operating on */ + if (!dst_vma->vm_userfaultfd_ctx.ctx || + dst_vma->vm_userfaultfd_ctx.ctx != ctx) + return -EINVAL; + + /* Only allow moving across anonymous vmas */ + if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) + return -EINVAL; + + /* + * Ensure the dst_vma has an anon_vma or this page + * would get a NULL anon_vma when moved in the + * dst_vma. + */ + if (unlikely(anon_vma_prepare(dst_vma))) + return -ENOMEM; + + return 0; +} + +/** + * move_pages - move arbitrary anonymous pages of an existing vma + * @ctx: pointer to the userfaultfd context + * @mm: the address space to move pages + * @dst_start: start of the destination virtual memory range + * @src_start: start of the source virtual memory range + * @len: length of the virtual memory range + * @mode: flags from uffdio_move.mode + * + * Must be called with mmap_lock held for read. + * + * move_pages() remaps arbitrary anonymous pages atomically in zero + * copy. 
It only works on non shared anonymous pages because those can + * be relocated without generating non linear anon_vmas in the rmap + * code. + * + * It provides a zero copy mechanism to handle userspace page faults. + * The source vma pages should have mapcount == 1, which can be + * enforced by using madvise(MADV_DONTFORK) on src vma. + * + * The thread receiving the page during the userland page fault + * will receive the faulting page in the source vma through the network, + * storage or any other I/O device (MADV_DONTFORK in the source vma + * keeps move_pages() from failing with -EBUSY if the process forks before + * move_pages() is called), then it will call move_pages() to map the + * page in the faulting address in the destination vma. + * + * This userfaultfd command works purely via pagetables, so it's the + * most efficient way to move physical non shared anonymous pages + * across different virtual addresses. Unlike mremap()/mmap()/munmap() + * it does not create any new vmas. The mapping in the destination + * address is atomic. + * + * It only works if the vma protection bits are identical from the + * source and destination vma. + * + * It can remap non shared anonymous pages within the same vma too. + * + * If the source virtual memory range has any unmapped holes, or if + * the destination virtual memory range is not a whole unmapped hole, + * move_pages() will fail respectively with -ENOENT or -EEXIST. This + * provides a very strict behavior to avoid any chance of memory + * corruption going unnoticed if there are userland race conditions. + * Only one thread should resolve the userland page fault at any given + * time for any given faulting address. This means that if two threads + * try to both call move_pages() on the same destination address at the + * same time, the second thread will get an explicit error from this + * command. + * + * The command retval will return "len" if successful. 
The command + * however can be interrupted by fatal signals or errors. If + * interrupted it will return the number of bytes successfully + * remapped before the interruption if any, or the negative error if + * none. It will never return zero. Either it will return an error or + * an amount of bytes successfully moved. If the retval reports a + * "short" remap, the move_pages() command should be repeated by + * userland with src+retval, dst+retval, len-retval if it wants to know + * about the error that interrupted it. + * + * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to + * prevent -ENOENT errors from materializing if there are holes in the + * source virtual range that is being remapped. The holes will be + * accounted as successfully remapped in the retval of the + * command. This is mostly useful to remap hugepage naturally aligned + * virtual regions without knowing if there are transparent hugepage + * in the regions or not, but preventing the risk of having to split + * the hugepmd during the remap. + * + * If there's any rmap walk that is taking the anon_vma locks without + * first obtaining the folio lock (the only current instance is + * folio_referenced), they will have to verify if the folio->mapping + * has changed after taking the anon_vma lock. If it changed they + * should release the lock and retry obtaining a new anon_vma, because + * it means the anon_vma was changed by move_pages() before the lock + * could be obtained. This is the only additional complexity added to + * the rmap code to provide this anonymous page remapping functionality. + */ +ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, + unsigned long dst_start, unsigned long src_start, + unsigned long len, __u64 mode) +{ + struct vm_area_struct *src_vma, *dst_vma; + unsigned long src_addr, dst_addr; + pmd_t *src_pmd, *dst_pmd; + long err = -EINVAL; + ssize_t moved = 0; + + /* Sanitize the command parameters. 
*/ + if (WARN_ON_ONCE(src_start & ~PAGE_MASK) || + WARN_ON_ONCE(dst_start & ~PAGE_MASK) || + WARN_ON_ONCE(len & ~PAGE_MASK)) + goto out; + + /* Does the address range wrap, or is the span zero-sized? */ + if (WARN_ON_ONCE(src_start + len <= src_start) || + WARN_ON_ONCE(dst_start + len <= dst_start)) + goto out; + + /* + * Make sure the vma is not shared, that the src and dst remap + * ranges are both valid and fully within a single existing + * vma. + */ + src_vma = find_vma(mm, src_start); + if (!src_vma || (src_vma->vm_flags & VM_SHARED)) + goto out; + if (src_start < src_vma->vm_start || + src_start + len > src_vma->vm_end) + goto out; + + dst_vma = find_vma(mm, dst_start); + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + goto out; + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + goto out; + + err = validate_move_areas(ctx, src_vma, dst_vma); + if (err) + goto out; + + for (src_addr = src_start, dst_addr = dst_start; + src_addr < src_start + len;) { + spinlock_t *ptl; + pmd_t dst_pmdval; + unsigned long step_size; + + /* + * Below works because anonymous area would not have a + * transparent huge PUD. If file-backed support is added, + * that case would need to be handled here. + */ + src_pmd = mm_find_pmd(mm, src_addr); + if (unlikely(!src_pmd)) { + if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { + err = -ENOENT; + break; + } + src_pmd = mm_alloc_pmd(mm, src_addr); + if (unlikely(!src_pmd)) { + err = -ENOMEM; + break; + } + } + dst_pmd = mm_alloc_pmd(mm, dst_addr); + if (unlikely(!dst_pmd)) { + err = -ENOMEM; + break; + } + + dst_pmdval = pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is mapped as THP don't override it and just + * be strict. If dst_pmd changes into THP after this check, the + * move_pages_huge_pmd() will detect the change and retry + * while move_pages_pte() will detect the change and fail. 
+ */ + if (unlikely(pmd_trans_huge(dst_pmdval))) { + err = -EEXIST; + break; + } + + ptl = pmd_trans_huge_lock(src_pmd, src_vma); + if (ptl) { + if (pmd_devmap(*src_pmd)) { + spin_unlock(ptl); + err = -ENOENT; + break; + } + + /* Check if we can move the pmd without splitting it. */ + if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || + !pmd_none(dst_pmdval)) { + struct folio *folio = pfn_folio(pmd_pfn(*src_pmd)); + + if (!folio || !PageAnonExclusive(&folio->page)) { + spin_unlock(ptl); + err = -EBUSY; + break; + } + + spin_unlock(ptl); + split_huge_pmd(src_vma, src_pmd, src_addr); + /* The folio will be split by move_pages_pte() */ + continue; + } + + err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, + dst_pmdval, dst_vma, src_vma, + dst_addr, src_addr); + step_size = HPAGE_PMD_SIZE; + } else { + if (pmd_none(*src_pmd)) { + if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { + err = -ENOENT; + break; + } + if (unlikely(__pte_alloc(mm, src_pmd))) { + err = -ENOMEM; + break; + } + } + + if (unlikely(pte_alloc(mm, dst_pmd))) { + err = -ENOMEM; + break; + } + + err = move_pages_pte(mm, dst_pmd, src_pmd, + dst_vma, src_vma, + dst_addr, src_addr, mode); + step_size = PAGE_SIZE; + } + + cond_resched(); + + if (fatal_signal_pending(current)) { + /* Do not override an error */ + if (!err || err == -EAGAIN) + err = -EINTR; + break; + } + + if (err) { + if (err == -EAGAIN) + continue; + break; + } + + /* Proceed to the next page */ + dst_addr += step_size; + src_addr += step_size; + moved += step_size; + } + +out: + VM_WARN_ON(moved < 0); + VM_WARN_ON(err > 0); + VM_WARN_ON(!moved && !err); + return moved ? moved : err; +} -- cgit v1.2.3 From 96db66d9c8f3c1547325af01b1f328b85d6ee1b9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Dec 2023 16:22:06 +0000 Subject: mm: convert ksm_might_need_to_copy() to work on folios Patch series "Finish two folio conversions". 
Most callers of page_add_new_anon_rmap() and lru_cache_add_inactive_or_unevictable() have been converted to their folio equivalents, but there are still a few stragglers. There's a bit of preparatory work in ksm and unuse_pte(), but after that it's pretty mechanical. This patch (of 9): Accept a folio as an argument and return a folio result. Removes a call to compound_head() in do_swap_page(), and prevents folio & page from getting out of sync in unuse_pte(). Reviewed-by: David Hildenbrand [willy@infradead.org: fix smatch warning] Link: https://lkml.kernel.org/r/ZXnPtblC6A1IkyAB@casper.infradead.org [david@redhat.com: only adjust the page if the folio changed] Link: https://lkml.kernel.org/r/6a8f2110-fa91-4c10-9eae-88315309a6e3@redhat.com Link: https://lkml.kernel.org/r/20231211162214.2146080-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231211162214.2146080-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/ksm.h | 6 +++--- mm/ksm.c | 21 +++++++++++---------- mm/memory.c | 11 +++++++---- mm/swapfile.c | 8 +++++--- 4 files changed, 26 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 4643d5244e77..401348e9f92b 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -76,7 +76,7 @@ static inline void ksm_exit(struct mm_struct *mm) * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? 
*/ -struct page *ksm_might_need_to_copy(struct page *page, +struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); @@ -129,10 +129,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; } -static inline struct page *ksm_might_need_to_copy(struct page *page, +static inline struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { - return page; + return folio; } static inline void rmap_walk_ksm(struct folio *folio, diff --git a/mm/ksm.c b/mm/ksm.c index c0e1995fb444..e2ce850c2739 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2873,30 +2873,30 @@ void __ksm_exit(struct mm_struct *mm) trace_ksm_exit(mm); } -struct page *ksm_might_need_to_copy(struct page *page, +struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = page_folio(page); + struct page *page = folio_page(folio, 0); struct anon_vma *anon_vma = folio_anon_vma(folio); struct folio *new_folio; if (folio_test_large(folio)) - return page; + return folio; if (folio_test_ksm(folio)) { if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) - return page; /* no need to copy it */ + return folio; /* no need to copy it */ } else if (!anon_vma) { - return page; /* no need to copy it */ + return folio; /* no need to copy it */ } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { - return page; /* still no need to copy it */ + return folio; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!folio_test_uptodate(folio)) - return page; /* let do_swap_page report the error */ + return folio; /* let do_swap_page report the error */ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); if (new_folio && @@ -2905,9 +2905,10 @@ struct page 
*ksm_might_need_to_copy(struct page *page, new_folio = NULL; } if (new_folio) { - if (copy_mc_user_highpage(&new_folio->page, page, addr, vma)) { + if (copy_mc_user_highpage(folio_page(new_folio, 0), page, + addr, vma)) { folio_put(new_folio); - memory_failure_queue(page_to_pfn(page), 0); + memory_failure_queue(folio_pfn(folio), 0); return ERR_PTR(-EHWPOISON); } folio_set_dirty(new_folio); @@ -2918,7 +2919,7 @@ struct page *ksm_might_need_to_copy(struct page *page, #endif } - return new_folio ? &new_folio->page : NULL; + return new_folio; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) diff --git a/mm/memory.c b/mm/memory.c index b9cc56a75f4b..7649cb9eb7f5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3942,15 +3942,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * page->index of !PageKSM() pages would be nonlinear inside the * anon VMA -- PageKSM() is lost on actual swapout. */ - page = ksm_might_need_to_copy(page, vma, vmf->address); - if (unlikely(!page)) { + folio = ksm_might_need_to_copy(folio, vma, vmf->address); + if (unlikely(!folio)) { ret = VM_FAULT_OOM; + folio = swapcache; goto out_page; - } else if (unlikely(PTR_ERR(page) == -EHWPOISON)) { + } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { ret = VM_FAULT_HWPOISON; + folio = swapcache; goto out_page; } - folio = page_folio(page); + if (folio != swapcache) + page = folio_page(folio, 0); /* * If we want to map a page that's in the swapcache writable, we diff --git a/mm/swapfile.c b/mm/swapfile.c index 8be70912e298..0371b7b3cd27 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1749,11 +1749,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, int ret = 1; swapcache = page; - page = ksm_might_need_to_copy(page, vma, addr); - if (unlikely(!page)) + folio = ksm_might_need_to_copy(folio, vma, addr); + if (unlikely(!folio)) return -ENOMEM; - else if (unlikely(PTR_ERR(page) == -EHWPOISON)) + else if (unlikely(folio == ERR_PTR(-EHWPOISON))) hwpoisoned = true; + 
else + page = folio_file_page(folio, swp_offset(entry)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), -- cgit v1.2.3 From cafa8e37a2ebd344ae0774324c21f46640bbaab3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Dec 2023 16:22:14 +0000 Subject: mm: remove page_add_new_anon_rmap and lru_cache_add_inactive_or_unevictable All callers have now been converted to folio_add_new_anon_rmap() and folio_add_lru_vma() so we can remove the wrapper. Link: https://lkml.kernel.org/r/20231211162214.2146080-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 -- include/linux/swap.h | 3 --- mm/folio-compat.c | 16 ---------------- 3 files changed, 21 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index af6a32b6f3e7..0ae2bb0e77f5 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -197,8 +197,6 @@ typedef int __bitwise rmap_t; void folio_move_anon_rmap(struct folio *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, diff --git a/include/linux/swap.h b/include/linux/swap.h index f6dd6575b905..3e1909087f6a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -397,9 +397,6 @@ void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); -extern void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma); - /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long 
try_to_free_pages(struct zonelist *zonelist, int order, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index aee3b9a16828..50412014f16f 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -77,12 +77,6 @@ bool redirty_page_for_writepage(struct writeback_control *wbc, } EXPORT_SYMBOL(redirty_page_for_writepage); -void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma) -{ - folio_add_lru_vma(page_folio(page), vma); -} - int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) { @@ -122,13 +116,3 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } - -#ifdef CONFIG_MMU -void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, - unsigned long address) -{ - VM_BUG_ON_PAGE(PageTail(page), page); - - return folio_add_new_anon_rmap((struct folio *)page, vma, address); -} -#endif -- cgit v1.2.3 From 5088b49730afaaf3134d42705cfcff7ce8be082e Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 18 Dec 2023 15:10:53 -0800 Subject: mm/ksm: add tracepoint for ksm advisor This adds a new tracepoint for the ksm advisor. It reports the last scan time, the new setting of the pages_to_scan parameter and the average cpu percent usage of the ksmd background thread for the last scan. 
Link: https://lkml.kernel.org/r/20231218231054.1625219-4-shr@devkernel.io Signed-off-by: Stefan Roesch Acked-by: David Hildenbrand Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/trace/events/ksm.h | 33 +++++++++++++++++++++++++++++++++ mm/ksm.c | 1 + 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/include/trace/events/ksm.h b/include/trace/events/ksm.h index b5ac35c1d0e8..e728647b5d26 100644 --- a/include/trace/events/ksm.h +++ b/include/trace/events/ksm.h @@ -245,6 +245,39 @@ TRACE_EVENT(ksm_remove_rmap_item, __entry->pfn, __entry->rmap_item, __entry->mm) ); +/** + * ksm_advisor - called after the advisor has run + * + * @scan_time: scan time in seconds + * @pages_to_scan: new pages_to_scan value + * @cpu_percent: cpu usage in percent + * + * Allows to trace the ksm advisor. + */ +TRACE_EVENT(ksm_advisor, + + TP_PROTO(s64 scan_time, unsigned long pages_to_scan, + unsigned int cpu_percent), + + TP_ARGS(scan_time, pages_to_scan, cpu_percent), + + TP_STRUCT__entry( + __field(s64, scan_time) + __field(unsigned long, pages_to_scan) + __field(unsigned int, cpu_percent) + ), + + TP_fast_assign( + __entry->scan_time = scan_time; + __entry->pages_to_scan = pages_to_scan; + __entry->cpu_percent = cpu_percent; + ), + + TP_printk("ksm scan time %lld pages_to_scan %lu cpu percent %u", + __entry->scan_time, __entry->pages_to_scan, + __entry->cpu_percent) +); + #endif /* _TRACE_KSM_H */ /* This part must be outside protection */ diff --git a/mm/ksm.c b/mm/ksm.c index ce3001745562..8fa6053a225d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -458,6 +458,7 @@ static void scan_time_advisor(void) advisor_ctx.cpu_time = cpu_time; ksm_thread_pages_to_scan = pages; + trace_ksm_advisor(scan_time, pages, cpu_percent); } static void advisor_stop_scan(void) -- cgit v1.2.3 From 8ba2f844f050a82624ba3ad5146aa3c116f506f7 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 28 Dec 2023 09:45:46 +0000 Subject: mm/zswap: change per-cpu mutex and 
buffer to per-acomp_ctx First of all, we need to rename acomp_ctx->dstmem field to buffer, since we are now using it for purposes other than compression. Then we change per-cpu mutex and buffer to per-acomp_ctx, since they belong to the acomp_ctx and are necessary parts when used in the compress/decompress contexts. So we can remove the old per-cpu mutex and dstmem. Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-5-9382162bbf05@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Chris Li (Google) Reviewed-by: Nhat Pham Cc: Barry Song <21cnbao@gmail.com> Cc: Dan Streetman Cc: Johannes Weiner Cc: Seth Jennings Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/cpuhotplug.h | 1 - mm/zswap.c | 104 ++++++++++++++------------------------------- 2 files changed, 33 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index efc0c0b07efb..c3e06e21766a 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -124,7 +124,6 @@ enum cpuhp_state { CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, CPUHP_MM_ZS_PREPARE, - CPUHP_MM_ZSWP_MEM_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, diff --git a/mm/zswap.c b/mm/zswap.c index 880c33bbe146..12ce5a68da91 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -161,8 +161,8 @@ struct crypto_acomp_ctx { struct crypto_acomp *acomp; struct acomp_req *req; struct crypto_wait wait; - u8 *dstmem; - struct mutex *mutex; + u8 *buffer; + struct mutex mutex; }; /* @@ -688,63 +688,26 @@ static void zswap_alloc_shrinker(struct zswap_pool *pool) /********************************* * per-cpu code **********************************/ -static DEFINE_PER_CPU(u8 *, zswap_dstmem); -/* - * If users dynamically change the zpool type and compressor at runtime, i.e. - * zswap is running, zswap can have more than one zpool on one cpu, but they - * are sharing dtsmem. So we need this mutex to be per-cpu. 
- */ -static DEFINE_PER_CPU(struct mutex *, zswap_mutex); - -static int zswap_dstmem_prepare(unsigned int cpu) -{ - struct mutex *mutex; - u8 *dst; - - dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) - return -ENOMEM; - - mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu)); - if (!mutex) { - kfree(dst); - return -ENOMEM; - } - - mutex_init(mutex); - per_cpu(zswap_dstmem, cpu) = dst; - per_cpu(zswap_mutex, cpu) = mutex; - return 0; -} - -static int zswap_dstmem_dead(unsigned int cpu) -{ - struct mutex *mutex; - u8 *dst; - - mutex = per_cpu(zswap_mutex, cpu); - kfree(mutex); - per_cpu(zswap_mutex, cpu) = NULL; - - dst = per_cpu(zswap_dstmem, cpu); - kfree(dst); - per_cpu(zswap_dstmem, cpu) = NULL; - - return 0; -} - static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); struct crypto_acomp *acomp; struct acomp_req *req; + int ret; + + mutex_init(&acomp_ctx->mutex); + + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!acomp_ctx->buffer) + return -ENOMEM; acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); if (IS_ERR(acomp)) { pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); - return PTR_ERR(acomp); + ret = PTR_ERR(acomp); + goto acomp_fail; } acomp_ctx->acomp = acomp; @@ -752,8 +715,8 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); - crypto_free_acomp(acomp_ctx->acomp); - return -ENOMEM; + ret = -ENOMEM; + goto req_fail; } acomp_ctx->req = req; @@ -766,10 +729,13 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); - acomp_ctx->mutex = 
per_cpu(zswap_mutex, cpu); - acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu); - return 0; + +req_fail: + crypto_free_acomp(acomp_ctx->acomp); +acomp_fail: + kfree(acomp_ctx->buffer); + return ret; } static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) @@ -782,6 +748,7 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) acomp_request_free(acomp_ctx->req); if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) crypto_free_acomp(acomp_ctx->acomp); + kfree(acomp_ctx->buffer); } return 0; @@ -1391,12 +1358,12 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) u8 *src; acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(acomp_ctx->mutex); + mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); if (!zpool_can_sleep_mapped(zpool)) { - memcpy(acomp_ctx->dstmem, src, entry->length); - src = acomp_ctx->dstmem; + memcpy(acomp_ctx->buffer, src, entry->length); + src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); } @@ -1406,7 +1373,7 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE); BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - mutex_unlock(acomp_ctx->mutex); + mutex_unlock(&acomp_ctx->mutex); if (zpool_can_sleep_mapped(zpool)) zpool_unmap_handle(zpool, entry->handle); @@ -1622,13 +1589,17 @@ bool zswap_store(struct folio *folio) /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - mutex_lock(acomp_ctx->mutex); + mutex_lock(&acomp_ctx->mutex); - dst = acomp_ctx->dstmem; + dst = acomp_ctx->buffer; sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); - /* zswap_dstmem is of size (PAGE_SIZE * 2). 
Reflect same in sg_list */ + /* + * We need PAGE_SIZE * 2 here since there may be an over-compression case, + * and hardware accelerators may not check the dst buffer size, so + * give the dst buffer enough length to avoid buffer overflow. + */ sg_init_one(&output, dst, PAGE_SIZE * 2); acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); /* @@ -1668,7 +1639,7 @@ bool zswap_store(struct folio *folio) buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); memcpy(buf, dst, dlen); zpool_unmap_handle(zpool, handle); - mutex_unlock(acomp_ctx->mutex); + mutex_unlock(&acomp_ctx->mutex); /* populate entry */ entry->swpentry = swp_entry(type, offset); @@ -1711,7 +1682,7 @@ insert_entry: return true; put_dstmem: - mutex_unlock(acomp_ctx->mutex); + mutex_unlock(&acomp_ctx->mutex); put_pool: zswap_pool_put(entry->pool); freepage: @@ -1886,13 +1857,6 @@ static int zswap_setup(void) goto cache_fail; } - ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", - zswap_dstmem_prepare, zswap_dstmem_dead); - if (ret) { - pr_err("dstmem alloc failed\n"); - goto dstmem_fail; - } - ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, "mm/zswap_pool:prepare", zswap_cpu_comp_prepare, @@ -1924,8 +1888,6 @@ fallback_fail: if (pool) zswap_pool_destroy(pool); hp_fail: - cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); -dstmem_fail: kmem_cache_destroy(zswap_entry_cache); cache_fail: /* if built-in, we aren't unloaded on failure; don't allow use */ -- cgit v1.2.3 From 96c7b0b42239e7b8987b2664b458dc74e825f760 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:30 +0000 Subject: mm: return the folio from __read_swap_cache_async() Patch series "More swap folio conversions". These all seem like fairly straightforward conversions to me. A lot of compound_head() calls get removed. And page_swap_info(), which is nice. This patch (of 13): Move the folio->page conversion into the callers that actually want that. 
Most of the callers are happier with the folio anyway. If the page_allocated boolean is set, the folio allocated is of order-0, so it is safe to pass the page directly to swap_readpage(). Link: https://lkml.kernel.org/r/20231213215842.671461-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231213215842.671461-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 +-- mm/swap.h | 7 +++-- mm/swap_state.c | 75 ++++++++++++++++++++++++--------------------------- mm/zswap.c | 58 +++++++++++++++++++-------------------- 4 files changed, 69 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 08c240e16a01..e88572d4c720 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -34,7 +34,7 @@ void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); -void zswap_page_swapin(struct page *page); +void zswap_folio_swapin(struct folio *folio); #else struct zswap_lruvec_state {}; @@ -54,7 +54,7 @@ static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} -static inline void zswap_page_swapin(struct page *page) {} +static inline void zswap_folio_swapin(struct folio *folio) {} #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/swap.h b/mm/swap.h index c0dc73e10e91..a60ab1cfcaf2 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -49,10 +49,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); -struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated, - bool skip_if_exists); +struct folio 
*__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, + bool skip_if_exists); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, diff --git a/mm/swap_state.c b/mm/swap_state.c index c597cec606e4..874b40a1f502 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -410,14 +410,12 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, return folio; } -struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct mempolicy *mpol, pgoff_t ilx, - bool *new_page_allocated, - bool skip_if_exists) +struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, + bool skip_if_exists) { struct swap_info_struct *si; struct folio *folio; - struct page *page; void *shadow = NULL; *new_page_allocated = false; @@ -434,10 +432,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); - if (!IS_ERR(folio)) { - page = folio_file_page(folio, swp_offset(entry)); - goto got_page; - } + if (!IS_ERR(folio)) + goto got_folio; /* * Just skip read ahead for unused swap slot. @@ -451,7 +447,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_put_swap; /* - * Get a new page to read into from swap. Allocate it now, + * Get a new folio to read into from swap. Allocate it now, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ @@ -487,13 +483,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. 
Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE - * in swap_map, but not yet added its page to swap cache. + * in swap_map, but not yet added its folio to swap cache. */ schedule_timeout_uninterruptible(1); } /* - * The swap entry is ours to swap in. Prepare the new page. + * The swap entry is ours to swap in. Prepare the new folio. */ __folio_set_locked(folio); @@ -514,10 +510,9 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* Caller will initiate read into locked folio */ folio_add_lru(folio); *new_page_allocated = true; - page = &folio->page; -got_page: +got_folio: put_swap_device(si); - return page; + return folio; fail_unlock: put_swap_folio(folio, entry); @@ -545,16 +540,16 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; - struct page *page; + struct folio *folio; mpol = get_vma_policy(vma, addr, 0, &ilx); - page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) - swap_readpage(page, false, plug); - return page; + swap_readpage(&folio->page, false, plug); + return folio_file_page(folio, swp_offset(entry)); } static unsigned int __swapin_nr_pages(unsigned long prev_offset, @@ -639,7 +634,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { - struct page *page; + struct folio *folio; unsigned long entry_offset = swp_offset(entry); unsigned long offset = entry_offset; unsigned long start_offset, end_offset; @@ -664,31 +659,31 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - page = __read_swap_cache_async( + folio = 
__read_swap_cache_async( swp_entry(swp_type(entry), offset), gfp_mask, mpol, ilx, &page_allocated, false); - if (!page) + if (!folio) continue; if (page_allocated) { - swap_readpage(page, false, &splug); + swap_readpage(&folio->page, false, &splug); if (offset != entry_offset) { - SetPageReadahead(page); + folio_set_readahead(folio); count_vm_event(SWAP_RA); } } - put_page(page); + folio_put(folio); } blk_finish_plug(&plug); swap_read_unplug(splug); lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(page, false, NULL); - zswap_page_swapin(page); - return page; + swap_readpage(&folio->page, false, NULL); + zswap_folio_swapin(folio); + return folio_file_page(folio, swp_offset(entry)); } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -806,7 +801,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, { struct blk_plug plug; struct swap_iocb *splug = NULL; - struct page *page; + struct folio *folio; pte_t *pte = NULL, pentry; unsigned long addr; swp_entry_t entry; @@ -839,18 +834,18 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; - page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (!page) + if (!folio) continue; if (page_allocated) { - swap_readpage(page, false, &splug); + swap_readpage(&folio->page, false, &splug); if (i != ra_info.offset) { - SetPageReadahead(page); + folio_set_readahead(folio); count_vm_event(SWAP_RA); } } - put_page(page); + folio_put(folio); } if (pte) pte_unmap(pte); @@ -858,13 +853,13 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, 
gfp_t gfp_mask, swap_read_unplug(splug); lru_add_drain(); skip: - /* The page was likely read above, so no need for plugging here */ - page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, + /* The folio was likely read above, so no need for plugging here */ + folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); if (unlikely(page_allocated)) - swap_readpage(page, false, NULL); - zswap_page_swapin(page); - return page; + swap_readpage(&folio->page, false, NULL); + zswap_folio_swapin(folio); + return folio_file_page(folio, swp_offset(entry)); } /** diff --git a/mm/zswap.c b/mm/zswap.c index 12ce5a68da91..6d9ee2a5334a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -368,12 +368,12 @@ void zswap_lruvec_state_init(struct lruvec *lruvec) atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); } -void zswap_page_swapin(struct page *page) +void zswap_folio_swapin(struct folio *folio) { struct lruvec *lruvec; - if (page) { - lruvec = folio_lruvec(page_folio(page)); + if (folio) { + lruvec = folio_lruvec(folio); atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); } } @@ -1383,14 +1383,14 @@ static void __zswap_load(struct zswap_entry *entry, struct page *page) * writeback code **********************************/ /* - * Attempts to free an entry by adding a page to the swap cache, - * decompressing the entry data into the page, and issuing a - * bio write to write the page back to the swap device. + * Attempts to free an entry by adding a folio to the swap cache, + * decompressing the entry data into the folio, and issuing a + * bio write to write the folio back to the swap device. * - * This can be thought of as a "resumed writeback" of the page + * This can be thought of as a "resumed writeback" of the folio * to the swap device. We are basically resuming the same swap * writeback path that was intercepted with the zswap_store() - * in the first place. 
After the page has been decompressed into + * in the first place. After the folio has been decompressed into * the swap cache, the compressed version stored by zswap can be * freed. */ @@ -1398,56 +1398,56 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree) { swp_entry_t swpentry = entry->swpentry; - struct page *page; + struct folio *folio; struct mempolicy *mpol; - bool page_was_allocated; + bool folio_was_allocated; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; - /* try to allocate swap cache page */ + /* try to allocate swap cache folio */ mpol = get_task_policy(current); - page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, - NO_INTERLEAVE_INDEX, &page_was_allocated, true); - if (!page) + folio = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &folio_was_allocated, true); + if (!folio) return -ENOMEM; /* - * Found an existing page, we raced with load/swapin. We generally - * writeback cold pages from zswap, and swapin means the page just - * became hot. Skip this page and let the caller find another one. + * Found an existing folio, we raced with load/swapin. We generally + * writeback cold folios from zswap, and swapin means the folio just + * became hot. Skip this folio and let the caller find another one. */ - if (!page_was_allocated) { - put_page(page); + if (!folio_was_allocated) { + folio_put(folio); return -EEXIST; } /* - * Page is locked, and the swapcache is now secured against + * folio is locked, and the swapcache is now secured against * concurrent swapping to and from the slot. Verify that the * swap entry hasn't been invalidated and recycled behind our * backs (our zswap_entry reference doesn't prevent that), to - * avoid overwriting a new swap page with old compressed data. + * avoid overwriting a new swap folio with old compressed data. 
*/ spin_lock(&tree->lock); if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); - delete_from_swap_cache(page_folio(page)); + delete_from_swap_cache(folio); return -ENOMEM; } spin_unlock(&tree->lock); - __zswap_load(entry, page); + __zswap_load(entry, &folio->page); - /* page is up to date */ - SetPageUptodate(page); + /* folio is up to date */ + folio_mark_uptodate(folio); /* move it to the tail of the inactive list after end_writeback */ - SetPageReclaim(page); + folio_set_reclaim(folio); /* start writeback */ - __swap_writepage(page, &wbc); - put_page(page); + __swap_writepage(&folio->page, &wbc); + folio_put(folio); return 0; } @@ -1593,7 +1593,7 @@ bool zswap_store(struct folio *folio) dst = acomp_ctx->buffer; sg_init_table(&input, 1); - sg_set_page(&input, page, PAGE_SIZE, 0); + sg_set_page(&input, &folio->page, PAGE_SIZE, 0); /* * We need PAGE_SIZE * 2 here since there maybe over-compression case, -- cgit v1.2.3 From 3a61e6f668120ee2c7840b91891c858d575d07e2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:38 +0000 Subject: mm: convert swap_page_sector() to swap_folio_sector() All callers have a folio, so pass it in. Saves a couple of calls to compound_head(). 
Link: https://lkml.kernel.org/r/20231213215842.671461-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- mm/page_io.c | 8 ++++---- mm/swapfile.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3e1909087f6a..2d09e9b7ee70 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -493,7 +493,7 @@ struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); -sector_t swap_page_sector(struct page *page); +sector_t swap_folio_sector(struct folio *folio); static inline void put_swap_device(struct swap_info_struct *si) { diff --git a/mm/page_io.c b/mm/page_io.c index e18afcd9c19a..6736c56526bf 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -334,7 +334,7 @@ static void swap_writepage_bdev_sync(struct folio *folio, bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); - bio.bi_iter.bi_sector = swap_page_sector(&folio->page); + bio.bi_iter.bi_sector = swap_folio_sector(folio); bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); bio_associate_blkg_from_page(&bio, folio); @@ -355,7 +355,7 @@ static void swap_writepage_bdev_async(struct folio *folio, bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), GFP_NOIO); - bio->bi_iter.bi_sector = swap_page_sector(&folio->page); + bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_write; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); @@ -461,7 +461,7 @@ static void swap_readpage_bdev_sync(struct folio *folio, struct bio bio; bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = swap_page_sector(&folio->page); + bio.bi_iter.bi_sector = swap_folio_sector(folio); 
bio_add_folio_nofail(&bio, folio, folio_size(folio), 0); /* * Keep this task valid during swap readpage because the oom killer may @@ -480,7 +480,7 @@ static void swap_readpage_bdev_async(struct folio *folio, struct bio *bio; bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); - bio->bi_iter.bi_sector = swap_page_sector(&folio->page); + bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); count_vm_event(PSWPIN); diff --git a/mm/swapfile.c b/mm/swapfile.c index 1501bc956456..b22c47b11d65 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -227,14 +227,14 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) BUG(); } -sector_t swap_page_sector(struct page *page) +sector_t swap_folio_sector(struct folio *folio) { - struct swap_info_struct *sis = page_swap_info(page); + struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; - offset = __page_file_index(page); + offset = swp_offset(folio->swap); se = offset_to_swap_extent(sis, offset); sector = se->start_block + (offset - se->start_page); return sector << (PAGE_SHIFT - 9); -- cgit v1.2.3 From 69fe7d67cb0c6eeab3d4c9a3bf950f9d12af4719 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:40 +0000 Subject: mm: remove page_swap_info() It's more efficient to get the swap_info_struct by calling swp_swap_info() directly. 
Link: https://lkml.kernel.org/r/20231213215842.671461-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 +-- mm/swap.h | 2 +- mm/swapfile.c | 8 +------- 3 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2d09e9b7ee70..4db00ddad261 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -487,8 +487,7 @@ extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -extern struct swap_info_struct *page_swap_info(struct page *); -extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); +struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); diff --git a/mm/swap.h b/mm/swap.h index 859ae8f0fd2d..6bf25342589f 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -60,7 +60,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, static inline unsigned int folio_swap_flags(struct folio *folio) { - return page_swap_info(&folio->page)->flags; + return swp_swap_info(folio->swap)->flags; } #else /* CONFIG_SWAP */ struct swap_iocb; diff --git a/mm/swapfile.c b/mm/swapfile.c index f3e23a3d26ae..2f877ca44513 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3369,18 +3369,12 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry) return swap_type_to_swap_info(swp_type(entry)); } -struct swap_info_struct *page_swap_info(struct page *page) -{ - swp_entry_t entry = page_swap_entry(page); - return swp_swap_info(entry); -} - /* * out-of-line methods to avoid include hell. 
*/ struct address_space *swapcache_mapping(struct folio *folio) { - return page_swap_info(&folio->page)->swap_file->f_mapping; + return swp_swap_info(folio->swap)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(swapcache_mapping); -- cgit v1.2.3 From f099c961f4998ad7107b1c6a7d6efb225e9a4614 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Dec 2023 20:02:32 +0000 Subject: fs: remove clean_page_buffers() Patch series "Clean up the writeback paths". Most of these patches verge on the trivial, converting filesystems that just use block_write_full_page() to use mpage_writepages(). But as we saw with Christoph's earlier patchset, there can be some "interesting" gotchas, and I clearly haven't tested the majority of filesystems I've touched here. Patches 3 & 4 get rid of a lot of stack usage on architectures with larger page sizes; 1024 bytes on 64-bit systems with 64KiB pages. It starts to open the door to larger folio sizes on all architectures, but it's certainly not enough yet. Patch 14 is kind of trivial, but it's nice to get that simplification in. This patch (of 14): This function has been unused since the removal of bdev_write_page(). Link: https://lkml.kernel.org/r/20231215200245.748418-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231215200245.748418-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Signed-off-by: Andrew Morton --- fs/mpage.c | 10 ---------- include/linux/buffer_head.h | 1 - 2 files changed, 11 deletions(-) (limited to 'include') diff --git a/fs/mpage.c b/fs/mpage.c index ffb064ed9d04..63bf99856024 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -455,16 +455,6 @@ static void clean_buffers(struct page *page, unsigned first_unmapped) try_to_free_buffers(page_folio(page)); } -/* - * For situations where we want to clean all buffers attached to a page. 
- * We don't need to calculate how many buffers are attached to the page, - * we just need to specify a number larger than the maximum number of buffers. - */ -void clean_page_buffers(struct page *page) -{ - clean_buffers(page, ~0U); -} - static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5f23ee599889..94f6161eb45e 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -270,7 +270,6 @@ int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); -void clean_page_buffers(struct page *page); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **, get_block_t *, loff_t *); -- cgit v1.2.3 From 17bf23a981be9c6629198a76940c777eb5c8c521 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Dec 2023 20:02:44 +0000 Subject: fs: convert block_write_full_page to block_write_full_folio Convert the function to be compatible with writepage_t so that it can be passed to write_cache_pages() by blkdev. This removes a call to compound_head(). We can also remove the function export as both callers are built-in. 
Link: https://lkml.kernel.org/r/20231215200245.748418-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Signed-off-by: Andrew Morton --- block/fops.c | 21 ++++++++++++++++++--- fs/buffer.c | 16 +++++++--------- fs/ext4/page-io.c | 2 +- fs/gfs2/aops.c | 4 ++-- fs/mpage.c | 2 +- fs/ntfs/aops.c | 4 ++-- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/file.c | 2 +- include/linux/buffer_head.h | 4 ++-- 9 files changed, 35 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/block/fops.c b/block/fops.c index 0bdad1e8d514..0cf8cf72cdfa 100644 --- a/block/fops.c +++ b/block/fops.c @@ -410,9 +410,24 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } -static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +/* + * We cannot call mpage_writepages() as it does not take the buffer lock. + * We must use block_write_full_folio() directly which holds the buffer + * lock. The buffer lock provides the synchronisation with writeback + * that filesystems rely on when they use the blockdev's mapping. 
+ */ +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, blkdev_get_block, wbc); + struct blk_plug plug; + int err; + + blk_start_plug(&plug); + err = write_cache_pages(mapping, wbc, block_write_full_folio, + blkdev_get_block); + blk_finish_plug(&plug); + + return err; } static int blkdev_read_folio(struct file *file, struct folio *folio) @@ -449,7 +464,7 @@ const struct address_space_operations def_blk_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, - .writepage = blkdev_writepage, + .writepages = blkdev_writepages, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .migrate_folio = buffer_migrate_folio_norefs, diff --git a/fs/buffer.c b/fs/buffer.c index 3a8c8322ed28..c838b4a31009 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -372,7 +372,7 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) } /* - * Completion handler for block_write_full_page() - pages which are unlocked + * Completion handler for block_write_full_folio() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. */ void end_buffer_async_write(struct buffer_head *bh, int uptodate) @@ -1771,18 +1771,18 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, */ /* - * While block_write_full_page is writing back the dirty buffers under + * While block_write_full_folio is writing back the dirty buffers under * the page lock, whoever dirtied the buffers may decide to clean them * again at any time. We handle that by only looking at the buffer * state inside lock_buffer(). * - * If block_write_full_page() is called for regular writeback + * If block_write_full_folio() is called for regular writeback * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a * locked buffer. 
This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. * - * If block_write_full_page() is called with wbc->sync_mode == + * If block_write_full_folio() is called with wbc->sync_mode == * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ @@ -1829,7 +1829,7 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, * truncate in progress. */ /* - * The buffer was zeroed by block_write_full_page() + * The buffer was zeroed by block_write_full_folio() */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); @@ -2696,10 +2696,9 @@ EXPORT_SYMBOL(block_truncate_page); /* * The generic ->writepage function for buffer-backed address_spaces */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, + void *get_block) { - struct folio *folio = page_folio(page); struct inode * const inode = folio->mapping->host; loff_t i_size = i_size_read(inode); @@ -2726,7 +2725,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block, return __block_write_full_folio(inode, folio, get_block, wbc, end_buffer_async_write); } -EXPORT_SYMBOL(block_write_full_page); sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index dfdd7e5cf038..312bc6813357 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -444,7 +444,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, folio_clear_error(folio); /* - * Comments copied from block_write_full_page: + * Comments copied from block_write_full_folio: * * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. 
"A file is mapped diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 5cffb079b87c..f986cd032b76 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -82,11 +82,11 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, } /** - * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_page + * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_folio * @folio: The folio to write * @wbc: The writeback control * - * This is the same as calling block_write_full_page, but it also + * This is the same as calling block_write_full_folio, but it also * writes pages outside of i_size */ static int gfs2_write_jdata_folio(struct folio *folio, diff --git a/fs/mpage.c b/fs/mpage.c index d4963f3d8051..738882e0766d 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -642,7 +642,7 @@ confused: /* * The caller has a ref on the inode, so *mapping is stable */ - ret = block_write_full_page(&folio->page, mpd->get_block, wbc); + ret = block_write_full_folio(folio, wbc, mpd->get_block); mapping_set_error(mapping, ret); out: mpd->bio = bio; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 70479ce915e8..6c414957e2c2 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1304,7 +1304,7 @@ done: * page cleaned. The VM has already locked the page and marked it clean. * * For non-resident attributes, ntfs_writepage() writes the @page by calling - * the ntfs version of the generic block_write_full_page() function, + * the ntfs version of the generic block_write_full_folio() function, * ntfs_write_block(), which in turn if necessary creates and writes the * buffers associated with the page asynchronously. * @@ -1314,7 +1314,7 @@ done: * vfs inode dirty code path for the inode the mft record belongs to or via the * vm page dirty code path for the page the mft record is in. * - * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page(). + * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio(). 
* * Return 0 on success and -errno on error. */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 91b32b2377ac..ea9127ba3208 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6934,7 +6934,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, * nonzero data on subsequent file extends. * * We need to call this before i_size is updated on the inode because - * otherwise block_write_full_page() will skip writeout of pages past + * otherwise block_write_full_folio() will skip writeout of pages past * i_size. */ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 94e2a1244442..8b6d15010703 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -818,7 +818,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* * fs-writeback will release the dirty pages without page lock * whose offset are over inode size, the release happens at - * block_write_full_page(). + * block_write_full_folio(). */ i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 94f6161eb45e..396b2adf24bf 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -252,8 +252,8 @@ void __bh_read_batch(int nr, struct buffer_head *bhs[], * address_spaces. 
*/ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc); +int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, + void *get_block); int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler); -- cgit v1.2.3 From 14059f66a959c760467ea2041e165f412845bcb8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Dec 2023 20:02:45 +0000 Subject: fs: remove the bh_end_io argument from __block_write_full_folio All callers are passing end_buffer_async_write as this argument, so we can hardcode references to it within __block_write_full_folio(). That lets us make end_buffer_async_write() static. Link: https://lkml.kernel.org/r/20231215200245.748418-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- fs/buffer.c | 22 ++++++++++------------ fs/gfs2/aops.c | 2 +- include/linux/buffer_head.h | 4 +--- 3 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index c838b4a31009..19548369bc6c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -372,10 +372,10 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) } /* - * Completion handler for block_write_full_folio() - pages which are unlocked - * during I/O, and which have PageWriteback cleared upon I/O completion. + * Completion handler for block_write_full_folio() - folios which are unlocked + * during I/O, and which have the writeback flag cleared upon I/O completion. 
*/ -void end_buffer_async_write(struct buffer_head *bh, int uptodate) +static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { unsigned long flags; struct buffer_head *first; @@ -415,7 +415,6 @@ still_busy: spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } -EXPORT_SYMBOL(end_buffer_async_write); /* * If a page's buffers are under async readin (end_buffer_async_read @@ -1787,8 +1786,7 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, * causes the writes to be flagged as synchronous writes. */ int __block_write_full_folio(struct inode *inode, struct folio *folio, - get_block_t *get_block, struct writeback_control *wbc, - bh_end_io_t *handler) + get_block_t *get_block, struct writeback_control *wbc) { int err; sector_t block; @@ -1867,7 +1865,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, continue; } if (test_clear_buffer_dirty(bh)) { - mark_buffer_async_write_endio(bh, handler); + mark_buffer_async_write_endio(bh, + end_buffer_async_write); } else { unlock_buffer(bh); } @@ -1920,7 +1919,8 @@ recover: if (buffer_mapped(bh) && buffer_dirty(bh) && !buffer_delay(bh)) { lock_buffer(bh); - mark_buffer_async_write_endio(bh, handler); + mark_buffer_async_write_endio(bh, + end_buffer_async_write); } else { /* * The buffer may have been set dirty during @@ -2704,8 +2704,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, /* Is the folio fully inside i_size? */ if (folio_pos(folio) + folio_size(folio) <= i_size) - return __block_write_full_folio(inode, folio, get_block, wbc, - end_buffer_async_write); + return __block_write_full_folio(inode, folio, get_block, wbc); /* Is the folio fully outside i_size? 
(truncate in progress) */ if (folio_pos(folio) >= i_size) { @@ -2722,8 +2721,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, */ folio_zero_segment(folio, offset_in_folio(folio, i_size), folio_size(folio)); - return __block_write_full_folio(inode, folio, get_block, wbc, - end_buffer_async_write); + return __block_write_full_folio(inode, folio, get_block, wbc); } sector_t generic_block_bmap(struct address_space *mapping, sector_t block, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index f986cd032b76..9914d7f54f7d 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -108,7 +108,7 @@ static int gfs2_write_jdata_folio(struct folio *folio, folio_size(folio)); return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc, - wbc, end_buffer_async_write); + wbc); } /** diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 396b2adf24bf..d78454a4dd1f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -205,7 +205,6 @@ struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); -void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); @@ -255,8 +254,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, void *get_block); int __block_write_full_folio(struct inode *inode, struct folio *folio, - get_block_t *get_block, struct writeback_control *wbc, - bh_end_io_t *handler); + get_block_t *get_block, struct writeback_control *wbc); int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); 
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, -- cgit v1.2.3 From 280ec6ccb6422aa4a04f9ac4216ddcf055acc95d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:45 +0100 Subject: kasan: rename kasan_slab_free_mempool to kasan_mempool_poison_object Patch series "kasan: save mempool stack traces". This series updates KASAN to save alloc and free stack traces for secondary-level allocators that cache and reuse allocations internally instead of giving them back to the underlying allocator (e.g. mempool). As a part of this change, introduce and document a set of KASAN hooks: bool kasan_mempool_poison_pages(struct page *page, unsigned int order); void kasan_mempool_unpoison_pages(struct page *page, unsigned int order); bool kasan_mempool_poison_object(void *ptr); void kasan_mempool_unpoison_object(void *ptr, size_t size); and use them in the mempool code. Besides mempool, skbuff and io_uring also cache allocations and already use KASAN hooks to poison those. Their code is updated to use the new mempool hooks. The new hooks save alloc and free stack traces (for normal kmalloc and slab objects; stack traces for large kmalloc objects and page_alloc are not supported by KASAN yet), improve the readability of the users' code, and also allow the users to prevent double-free and invalid-free bugs; see the patches for the details. This patch (of 21): Rename kasan_slab_free_mempool to kasan_mempool_poison_object. kasan_slab_free_mempool is a slightly confusing name: it is unclear whether this function poisons the object when it is freed into mempool or does something when the object is freed from mempool to the underlying allocator. The new name also aligns with other mempool-related KASAN hooks added in the following patches in this series. 
Link: https://lkml.kernel.org/r/cover.1703024586.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/c5618685abb7cdbf9fb4897f565e7759f601da84.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 8 ++++---- io_uring/alloc_cache.h | 3 +-- mm/kasan/common.c | 4 ++-- mm/mempool.c | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 72cb693b075b..6310435f528b 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -172,11 +172,11 @@ static __always_inline void kasan_kfree_large(void *ptr) __kasan_kfree_large(ptr, _RET_IP_); } -void __kasan_slab_free_mempool(void *ptr, unsigned long ip); -static __always_inline void kasan_slab_free_mempool(void *ptr) +void __kasan_mempool_poison_object(void *ptr, unsigned long ip); +static __always_inline void kasan_mempool_poison_object(void *ptr) { if (kasan_enabled()) - __kasan_slab_free_mempool(ptr, _RET_IP_); + __kasan_mempool_poison_object(ptr, _RET_IP_); } void * __must_check __kasan_slab_alloc(struct kmem_cache *s, @@ -256,7 +256,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init return false; } static inline void kasan_kfree_large(void *ptr) {} -static inline void kasan_slab_free_mempool(void *ptr) {} +static inline void kasan_mempool_poison_object(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init) { diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 241245cb54a6..8de0414e8efe 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -16,8 +16,7 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, if (cache->nr_cached < cache->max_cached) { cache->nr_cached++; 
wq_stack_add_head(&entry->node, &cache->list); - /* KASAN poisons object */ - kasan_slab_free_mempool(entry); + kasan_mempool_poison_object(entry); return true; } return false; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index fe6c4b43ad9f..e0394d0ee7f1 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -271,7 +271,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) /* * The object will be poisoned by kasan_poison_pages() or - * kasan_slab_free_mempool(). + * kasan_mempool_poison_object(). */ return false; @@ -282,7 +282,7 @@ void __kasan_kfree_large(void *ptr, unsigned long ip) ____kasan_kfree_large(ptr, ip); } -void __kasan_slab_free_mempool(void *ptr, unsigned long ip) +void __kasan_mempool_poison_object(void *ptr, unsigned long ip) { struct folio *folio; diff --git a/mm/mempool.c b/mm/mempool.c index b3d2084fd989..7e1c729f292b 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -107,7 +107,7 @@ static inline void poison_element(mempool_t *pool, void *element) static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_slab_free_mempool(element); + kasan_mempool_poison_object(element); else if (pool->alloc == mempool_alloc_pages) kasan_poison_pages(element, (unsigned long)pool->pool_data, false); -- cgit v1.2.3 From 9b94fe91099cbf05606151ef05bea9632666f5d5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:46 +0100 Subject: kasan: move kasan_mempool_poison_object Move kasan_mempool_poison_object after all slab-related KASAN hooks. This is a preparatory change for the following patches in this series. No functional changes. 
Link: https://lkml.kernel.org/r/23ea215409f43c13cdf9ecc454501a264c107d67.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 16 ++++++++-------- mm/kasan/common.c | 46 +++++++++++++++++++++++----------------------- 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6310435f528b..0d1f925c136d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -172,13 +172,6 @@ static __always_inline void kasan_kfree_large(void *ptr) __kasan_kfree_large(ptr, _RET_IP_); } -void __kasan_mempool_poison_object(void *ptr, unsigned long ip); -static __always_inline void kasan_mempool_poison_object(void *ptr) -{ - if (kasan_enabled()) - __kasan_mempool_poison_object(ptr, _RET_IP_); -} - void * __must_check __kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init); static __always_inline void * __must_check kasan_slab_alloc( @@ -219,6 +212,13 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } +void __kasan_mempool_poison_object(void *ptr, unsigned long ip); +static __always_inline void kasan_mempool_poison_object(void *ptr) +{ + if (kasan_enabled()) + __kasan_mempool_poison_object(ptr, _RET_IP_); +} + /* * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for * the hardware tag-based mode that doesn't rely on compiler instrumentation. 
@@ -256,7 +256,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init return false; } static inline void kasan_kfree_large(void *ptr) {} -static inline void kasan_mempool_poison_object(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init) { @@ -276,6 +275,7 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } +static inline void kasan_mempool_poison_object(void *ptr) {} static inline bool kasan_check_byte(const void *address) { return true; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index e0394d0ee7f1..fc7f711607e1 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -282,29 +282,6 @@ void __kasan_kfree_large(void *ptr, unsigned long ip) ____kasan_kfree_large(ptr, ip); } -void __kasan_mempool_poison_object(void *ptr, unsigned long ip) -{ - struct folio *folio; - - folio = virt_to_folio(ptr); - - /* - * Even though this function is only called for kmem_cache_alloc and - * kmalloc backed mempool allocations, those allocations can still be - * !PageSlab() when the size provided to kmalloc is larger than - * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. 
- */ - if (unlikely(!folio_test_slab(folio))) { - if (____kasan_kfree_large(ptr, ip)) - return; - kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false); - } else { - struct slab *slab = folio_slab(folio); - - ____kasan_slab_free(slab->slab_cache, ptr, ip, false, false); - } -} - void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags, bool init) { @@ -452,6 +429,29 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag return ____kasan_kmalloc(slab->slab_cache, object, size, flags); } +void __kasan_mempool_poison_object(void *ptr, unsigned long ip) +{ + struct folio *folio; + + folio = virt_to_folio(ptr); + + /* + * Even though this function is only called for kmem_cache_alloc and + * kmalloc backed mempool allocations, those allocations can still be + * !PageSlab() when the size provided to kmalloc is larger than + * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. + */ + if (unlikely(!folio_test_slab(folio))) { + if (____kasan_kfree_large(ptr, ip)) + return; + kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false); + } else { + struct slab *slab = folio_slab(folio); + + ____kasan_slab_free(slab->slab_cache, ptr, ip, false, false); + } +} + bool __kasan_check_byte(const void *address, unsigned long ip) { if (!kasan_byte_accessible(address)) { -- cgit v1.2.3 From 1bb843048d00050678c392dab87a15c8b756df6f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:47 +0100 Subject: kasan: document kasan_mempool_poison_object Add documentation comment for kasan_mempool_poison_object. 
Link: https://lkml.kernel.org/r/af33ba8cabfa1ad731fe23a3f874bfc8d3b7fed4.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 0d1f925c136d..bbf6e2fa4ffd 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -213,6 +213,24 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, } void __kasan_mempool_poison_object(void *ptr, unsigned long ip); +/** + * kasan_mempool_poison_object - Check and poison a mempool slab allocation. + * @ptr: Pointer to the slab allocation. + * + * This function is intended for kernel subsystems that cache slab allocations + * to reuse them instead of freeing them back to the slab allocator (e.g. + * mempool). + * + * This function poisons a slab allocation without initializing its memory and + * without putting it into the quarantine (for the Generic mode). + * + * This function also performs checks to detect double-free and invalid-free + * bugs and reports them. + * + * This function operates on all slab allocations including large kmalloc + * allocations (the ones returned by kmalloc_large() or by kmalloc() with the + * size > KMALLOC_MAX_SIZE). + */ static __always_inline void kasan_mempool_poison_object(void *ptr) { if (kasan_enabled()) -- cgit v1.2.3 From 2e7c954c11af96aa1e0566a706f22152ef91d759 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:48 +0100 Subject: kasan: add return value for kasan_mempool_poison_object Add a return value for kasan_mempool_poison_object that lets the caller know whether the allocation is affected by a double-free or an invalid-free bug. 
The caller can use this return value to stop operating on the object. Also introduce a check_page_allocation helper function to improve the code readability. Link: https://lkml.kernel.org/r/618af65273875fb9f56954285443279b15f1fcd9.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 17 ++++++++++++----- mm/kasan/common.c | 21 ++++++++++----------- 2 files changed, 22 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index bbf6e2fa4ffd..33387e254caa 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -212,7 +212,7 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } -void __kasan_mempool_poison_object(void *ptr, unsigned long ip); +bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); /** * kasan_mempool_poison_object - Check and poison a mempool slab allocation. * @ptr: Pointer to the slab allocation. @@ -225,16 +225,20 @@ void __kasan_mempool_poison_object(void *ptr, unsigned long ip); * without putting it into the quarantine (for the Generic mode). * * This function also performs checks to detect double-free and invalid-free - * bugs and reports them. + * bugs and reports them. The caller can use the return value of this function + * to find out if the allocation is buggy. * * This function operates on all slab allocations including large kmalloc * allocations (the ones returned by kmalloc_large() or by kmalloc() with the * size > KMALLOC_MAX_SIZE). + * + * Return: true if the allocation can be safely reused; false otherwise. 
*/ -static __always_inline void kasan_mempool_poison_object(void *ptr) +static __always_inline bool kasan_mempool_poison_object(void *ptr) { if (kasan_enabled()) - __kasan_mempool_poison_object(ptr, _RET_IP_); + return __kasan_mempool_poison_object(ptr, _RET_IP_); + return true; } /* @@ -293,7 +297,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } -static inline void kasan_mempool_poison_object(void *ptr) {} +static inline bool kasan_mempool_poison_object(void *ptr) +{ + return true; +} static inline bool kasan_check_byte(const void *address) { return true; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index fc7f711607e1..2b4869de4985 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -254,7 +254,7 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, return ____kasan_slab_free(cache, object, ip, true, init); } -static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) +static inline bool check_page_allocation(void *ptr, unsigned long ip) { if (!kasan_arch_is_ready()) return false; @@ -269,17 +269,14 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) return true; } - /* - * The object will be poisoned by kasan_poison_pages() or - * kasan_mempool_poison_object(). - */ - return false; } void __kasan_kfree_large(void *ptr, unsigned long ip) { - ____kasan_kfree_large(ptr, ip); + check_page_allocation(ptr, ip); + + /* The object will be poisoned by kasan_poison_pages(). 
*/ } void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, @@ -429,7 +426,7 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag return ____kasan_kmalloc(slab->slab_cache, object, size, flags); } -void __kasan_mempool_poison_object(void *ptr, unsigned long ip) +bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) { struct folio *folio; @@ -442,13 +439,15 @@ void __kasan_mempool_poison_object(void *ptr, unsigned long ip) * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. */ if (unlikely(!folio_test_slab(folio))) { - if (____kasan_kfree_large(ptr, ip)) - return; + if (check_page_allocation(ptr, ip)) + return false; kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false); + return true; } else { struct slab *slab = folio_slab(folio); - ____kasan_slab_free(slab->slab_cache, ptr, ip, false, false); + return !____kasan_slab_free(slab->slab_cache, ptr, ip, + false, false); } } -- cgit v1.2.3 From 1956832753735b1c399b86b2c66cb7c317dc9f31 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:49 +0100 Subject: kasan: introduce kasan_mempool_unpoison_object Introduce and document a kasan_mempool_unpoison_object hook. This hook serves as a replacement for the generic kasan_unpoison_range that the mempool code relies on right now. mempool will be updated to use the new hook in one of the following patches. For now, define the new hook to be identical to kasan_unpoison_range. One of the following patches will update it to add stack trace collection. 
Link: https://lkml.kernel.org/r/dae25f0e18ed8fd50efe509c5b71a0592de5c18d.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 31 +++++++++++++++++++++++++++++++ mm/kasan/common.c | 5 +++++ 2 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 33387e254caa..c5fe303bc1c2 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -228,6 +228,9 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); * bugs and reports them. The caller can use the return value of this function * to find out if the allocation is buggy. * + * Before the poisoned allocation can be reused, it must be unpoisoned via + * kasan_mempool_unpoison_object(). + * * This function operates on all slab allocations including large kmalloc * allocations (the ones returned by kmalloc_large() or by kmalloc() with the * size > KMALLOC_MAX_SIZE). @@ -241,6 +244,32 @@ static __always_inline bool kasan_mempool_poison_object(void *ptr) return true; } +void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip); +/** + * kasan_mempool_unpoison_object - Unpoison a mempool slab allocation. + * @ptr: Pointer to the slab allocation. + * @size: Size to be unpoisoned. + * + * This function is intended for kernel subsystems that cache slab allocations + * to reuse them instead of freeing them back to the slab allocator (e.g. + * mempool). + * + * This function unpoisons a slab allocation that was previously poisoned via + * kasan_mempool_poison_object() without initializing its memory. For the + * tag-based modes, this function does not assign a new tag to the allocation + * and instead restores the original tags based on the pointer value. 
+ * + * This function operates on all slab allocations including large kmalloc + * allocations (the ones returned by kmalloc_large() or by kmalloc() with the + * size > KMALLOC_MAX_SIZE). + */ +static __always_inline void kasan_mempool_unpoison_object(void *ptr, + size_t size) +{ + if (kasan_enabled()) + __kasan_mempool_unpoison_object(ptr, size, _RET_IP_); +} + /* * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for * the hardware tag-based mode that doesn't rely on compiler instrumentation. @@ -301,6 +330,8 @@ static inline bool kasan_mempool_poison_object(void *ptr) { return true; } +static inline void kasan_mempool_unpoison_object(void *ptr, size_t size) {} + static inline bool kasan_check_byte(const void *address) { return true; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2b4869de4985..4b85d35bb8ab 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -451,6 +451,11 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) } } +void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip) +{ + kasan_unpoison(ptr, size, false); +} + bool __kasan_check_byte(const void *address, unsigned long ip) { if (!kasan_byte_accessible(address)) { -- cgit v1.2.3 From f129c31039283df884913142b0f3797d64d3a9d6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:50 +0100 Subject: kasan: introduce kasan_mempool_poison_pages Introduce and document a kasan_mempool_poison_pages hook to be used by the mempool code instead of kasan_poison_pages. Compared to kasan_poison_pages, the new hook: 1. For the tag-based modes, skips checking and poisoning allocations that were not tagged due to sampling. 2. Checks for double-free and invalid-free bugs. In the future, kasan_poison_pages can also be updated to handle #2, but this is out-of-scope of this series. 
Link: https://lkml.kernel.org/r/88dc7340cce28249abf789f6e0c792c317df9ba5.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 27 +++++++++++++++++++++++++++ mm/kasan/common.c | 23 +++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index c5fe303bc1c2..de2a695ad34d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -212,6 +212,29 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } +bool __kasan_mempool_poison_pages(struct page *page, unsigned int order, + unsigned long ip); +/** + * kasan_mempool_poison_pages - Check and poison a mempool page allocation. + * @page: Pointer to the page allocation. + * @order: Order of the allocation. + * + * This function is intended for kernel subsystems that cache page allocations + * to reuse them instead of freeing them back to page_alloc (e.g. mempool). + * + * This function is similar to kasan_mempool_poison_object() but operates on + * page allocations. + * + * Return: true if the allocation can be safely reused; false otherwise. + */ +static __always_inline bool kasan_mempool_poison_pages(struct page *page, + unsigned int order) +{ + if (kasan_enabled()) + return __kasan_mempool_poison_pages(page, order, _RET_IP_); + return true; +} + bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); /** * kasan_mempool_poison_object - Check and poison a mempool slab allocation. 
@@ -326,6 +349,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } +static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int order) +{ + return true; +} static inline bool kasan_mempool_poison_object(void *ptr) { return true; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 4b85d35bb8ab..b416f4c265a4 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -426,6 +426,29 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag return ____kasan_kmalloc(slab->slab_cache, object, size, flags); } +bool __kasan_mempool_poison_pages(struct page *page, unsigned int order, + unsigned long ip) +{ + unsigned long *ptr; + + if (unlikely(PageHighMem(page))) + return true; + + /* Bail out if allocation was excluded due to sampling. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + page_kasan_tag(page) == KASAN_TAG_KERNEL) + return true; + + ptr = page_address(page); + + if (check_page_allocation(ptr, ip)) + return false; + + kasan_poison(ptr, PAGE_SIZE << order, KASAN_PAGE_FREE, false); + + return true; +} + bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) { struct folio *folio; -- cgit v1.2.3 From 9f41c59ae3163690868a32bd77e9e33c3bab555e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:51 +0100 Subject: kasan: introduce kasan_mempool_unpoison_pages Introduce and document a new kasan_mempool_unpoison_pages hook to be used by the mempool code instead of kasan_unpoison_pages. This hook is not functionally different from kasan_unpoison_pages, but using it improves the mempool code readability. 
Link: https://lkml.kernel.org/r/239bd9af6176f2cc59f5c25893eb36143184daff.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 25 +++++++++++++++++++++++++ mm/kasan/common.c | 6 ++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index de2a695ad34d..f8ebde384bd7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -225,6 +225,9 @@ bool __kasan_mempool_poison_pages(struct page *page, unsigned int order, * This function is similar to kasan_mempool_poison_object() but operates on * page allocations. * + * Before the poisoned allocation can be reused, it must be unpoisoned via + * kasan_mempool_unpoison_pages(). + * * Return: true if the allocation can be safely reused; false otherwise. */ static __always_inline bool kasan_mempool_poison_pages(struct page *page, @@ -235,6 +238,27 @@ static __always_inline bool kasan_mempool_poison_pages(struct page *page, return true; } +void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order, + unsigned long ip); +/** + * kasan_mempool_unpoison_pages - Unpoison a mempool page allocation. + * @page: Pointer to the page allocation. + * @order: Order of the allocation. + * + * This function is intended for kernel subsystems that cache page allocations + * to reuse them instead of freeing them back to page_alloc (e.g. mempool). + * + * This function unpoisons a page allocation that was previously poisoned by + * kasan_mempool_poison_pages() without zeroing the allocation's memory. For + * the tag-based modes, this function assigns a new tag to the allocation. 
+ */ +static __always_inline void kasan_mempool_unpoison_pages(struct page *page, + unsigned int order) +{ + if (kasan_enabled()) + __kasan_mempool_unpoison_pages(page, order, _RET_IP_); +} + bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); /** * kasan_mempool_poison_object - Check and poison a mempool slab allocation. @@ -353,6 +377,7 @@ static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int or { return true; } +static inline void kasan_mempool_unpoison_pages(struct page *page, unsigned int order) {} static inline bool kasan_mempool_poison_object(void *ptr) { return true; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b416f4c265a4..7ebc001d0fcd 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -449,6 +449,12 @@ bool __kasan_mempool_poison_pages(struct page *page, unsigned int order, return true; } +void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order, + unsigned long ip) +{ + __kasan_unpoison_pages(page, order, false); +} + bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) { struct folio *folio; -- cgit v1.2.3 From b556a462eb8df6b6836c318d23f43409c40a7c7e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:53 +0100 Subject: kasan: save free stack traces for slab mempools Make kasan_mempool_poison_object save free stack traces for slab and kmalloc mempools when the object is freed into the mempool. Also simplify and rename ____kasan_slab_free to poison_slab_object and do a few other readability changes. 
Link: https://lkml.kernel.org/r/413a7c7c3344fb56809853339ffaabc9e4905e94.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 5 +++-- mm/kasan/common.c | 20 +++++++++----------- 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index f8ebde384bd7..e636a00e26ba 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -268,8 +268,9 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); * to reuse them instead of freeing them back to the slab allocator (e.g. * mempool). * - * This function poisons a slab allocation without initializing its memory and - * without putting it into the quarantine (for the Generic mode). + * This function poisons a slab allocation and saves a free stack trace for it + * without initializing the allocation's memory and without putting it into the + * quarantine (for the Generic mode). * * This function also performs checks to detect double-free and invalid-free * bugs and reports them. 
The caller can use the return value of this function diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 3f4a1ed69e03..59146886e57d 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -207,8 +207,8 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, - unsigned long ip, bool quarantine, bool init) +static inline bool poison_slab_object(struct kmem_cache *cache, void *object, + unsigned long ip, bool init) { void *tagged_object; @@ -221,13 +221,12 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, if (is_kfence_address(object)) return false; - if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != - object)) { + if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != object)) { kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_INVALID_FREE); return true; } - /* RCU slabs could be legally used after free within the RCU period */ + /* RCU slabs could be legally used after free within the RCU period. */ if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; @@ -239,19 +238,18 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), KASAN_SLAB_FREE, init); - if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) - return false; - if (kasan_stack_collection_enabled()) kasan_save_free_info(cache, tagged_object); - return kasan_quarantine_put(cache, object); + return false; } bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip, bool init) { - return ____kasan_slab_free(cache, object, ip, true, init); + bool buggy_object = poison_slab_object(cache, object, ip, init); + + return buggy_object ? 
true : kasan_quarantine_put(cache, object); } static inline bool check_page_allocation(void *ptr, unsigned long ip) @@ -472,7 +470,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) } slab = folio_slab(folio); - return !____kasan_slab_free(slab->slab_cache, ptr, ip, false, false); + return !poison_slab_object(slab->slab_cache, ptr, ip, false); } void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip) -- cgit v1.2.3 From 29d7355a9d05de9a6e38cc4d1146fb96c43853fb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:56 +0100 Subject: kasan: save alloc stack traces for mempool Update kasan_mempool_unpoison_object to properly poison the redzone and save alloc stack traces for kmalloc and slab pools. As a part of this change, split out and use an unpoison_slab_object helper function from __kasan_slab_alloc. [nathan@kernel.org: mark unpoison_slab_object() as static] Link: https://lkml.kernel.org/r/20231221180042.104694-1-andrey.konovalov@linux.dev Link: https://lkml.kernel.org/r/05ad235da8347cfe14d496d01b2aaf074b4f607c.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Nathan Chancellor Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 7 ++++--- mm/kasan/common.c | 50 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index e636a00e26ba..7392c5d89b92 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -303,9 +303,10 @@ void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip); * mempool). * * This function unpoisons a slab allocation that was previously poisoned via - * kasan_mempool_poison_object() without initializing its memory. 
For the - * tag-based modes, this function does not assign a new tag to the allocation - * and instead restores the original tags based on the pointer value. + * kasan_mempool_poison_object() and saves an alloc stack trace for it without + * initializing the allocation's memory. For the tag-based modes, this function + * does not assign a new tag to the allocation and instead restores the + * original tags based on the pointer value. * * This function operates on all slab allocations including large kmalloc * allocations (the ones returned by kmalloc_large() or by kmalloc() with the diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 962805bf5f62..bf16c2dfa8e7 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -277,6 +277,20 @@ void __kasan_kfree_large(void *ptr, unsigned long ip) /* The object will be poisoned by kasan_poison_pages(). */ } +static inline void unpoison_slab_object(struct kmem_cache *cache, void *object, + gfp_t flags, bool init) +{ + /* + * Unpoison the whole object. For kmalloc() allocations, + * poison_kmalloc_redzone() will do precise poisoning. + */ + kasan_unpoison(object, cache->object_size, init); + + /* Save alloc info (if possible) for non-kmalloc() allocations. */ + if (kasan_stack_collection_enabled() && !is_kmalloc_cache(cache)) + kasan_save_alloc_info(cache, object, flags); +} + void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags, bool init) { @@ -299,15 +313,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, tag = assign_tag(cache, object, false); tagged_object = set_tag(object, tag); - /* - * Unpoison the whole object. - * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning. - */ - kasan_unpoison(tagged_object, cache->object_size, init); - - /* Save alloc info (if possible) for non-kmalloc() allocations. 
*/ - if (kasan_stack_collection_enabled() && !is_kmalloc_cache(cache)) - kasan_save_alloc_info(cache, tagged_object, flags); + /* Unpoison the object and save alloc info for non-kmalloc() allocations. */ + unpoison_slab_object(cache, tagged_object, flags, init); return tagged_object; } @@ -482,7 +489,30 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip) { - kasan_unpoison(ptr, size, false); + struct slab *slab; + gfp_t flags = 0; /* Might be executing under a lock. */ + + if (is_kfence_address(kasan_reset_tag(ptr))) + return; + + slab = virt_to_slab(ptr); + + /* + * This function can be called for large kmalloc allocation that get + * their memory from page_alloc. + */ + if (unlikely(!slab)) { + kasan_unpoison(ptr, size, false); + poison_kmalloc_large_redzone(ptr, size, flags); + return; + } + + /* Unpoison the object and save alloc info for non-kmalloc() allocations. */ + unpoison_slab_object(slab->slab_cache, ptr, size, flags); + + /* Poison the redzone and save alloc info for kmalloc() allocations. */ + if (is_kmalloc_cache(slab->slab_cache)) + poison_kmalloc_redzone(slab->slab_cache, ptr, size, flags); } bool __kasan_check_byte(const void *address, unsigned long ip) -- cgit v1.2.3 From 37dcc69ad17a008d2b720bdc39f070ef2a959430 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:59 +0100 Subject: mempool: introduce mempool_use_prealloc_only Introduce a new mempool_alloc_preallocated API that asks the mempool to only use the elements preallocated during the mempool's creation when allocating and to not attempt allocating new ones from the underlying allocator. This API is required to test the KASAN poisoning/unpoisoning functionality in KASAN tests, but it might be also useful on its own. 
Link: https://lkml.kernel.org/r/a14d809dbdfd04cc33bcacc632fee2abd6b83c00.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/mempool.h | 1 + mm/mempool.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include') diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 4aae6c06c5f2..7be1e32e6d42 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -51,6 +51,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; +extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; extern void mempool_free(void *element, mempool_t *pool); /* diff --git a/mm/mempool.c b/mm/mempool.c index 103dc4770cfb..cb7b4b56cec1 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -456,6 +456,43 @@ repeat_alloc: } EXPORT_SYMBOL(mempool_alloc); +/** + * mempool_alloc_preallocated - allocate an element from preallocated elements + * belonging to a specific memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * This function is similar to mempool_alloc, but it only attempts allocating + * an element from the preallocated elements. It does not sleep and immediately + * returns if no preallocated elements are available. + * + * Return: pointer to the allocated element or %NULL if no elements are + * available. 
+ */ +void *mempool_alloc_preallocated(mempool_t *pool) +{ + void *element; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr)) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + /* paired with rmb in mempool_free(), read comment there */ + smp_wmb(); + /* + * Update the allocation stack trace as this is more useful + * for debugging. + */ + kmemleak_update_trace(element); + return element; + } + spin_unlock_irqrestore(&pool->lock, flags); + + return NULL; +} +EXPORT_SYMBOL(mempool_alloc_preallocated); + /** * mempool_free - return an element to the pool. * @element: pool element pointer. -- cgit v1.2.3 From 1ce9a0523938f87dd8505233cc3445f8e2d8dcee Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:29:03 +0100 Subject: kasan: rename and document kasan_(un)poison_object_data Rename kasan_unpoison_object_data to kasan_unpoison_new_object and add a documentation comment. Do the same for kasan_poison_object_data. The new names and the comments should suggest the users that these hooks are intended for internal use by the slab allocator. The following patch will remove non-slab-internal uses of these hooks. No functional changes. 
[andreyknvl@google.com: update references to renamed functions in comments] Link: https://lkml.kernel.org/r/20231221180637.105098-1-andrey.konovalov@linux.dev Link: https://lkml.kernel.org/r/eab156ebbd635f9635ef67d1a4271f716994e628.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Signed-off-by: Andrew Morton --- include/linux/kasan.h | 35 +++++++++++++++++++++++++++-------- mm/kasan/common.c | 4 ++-- mm/kasan/shadow.c | 4 ++-- mm/slab.c | 10 ++++------ mm/slub.c | 4 ++-- net/core/skbuff.c | 8 ++++---- 6 files changed, 41 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 7392c5d89b92..d49e3d4c099e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -129,20 +129,39 @@ static __always_inline void kasan_poison_slab(struct slab *slab) __kasan_poison_slab(slab); } -void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object); -static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache, +void __kasan_unpoison_new_object(struct kmem_cache *cache, void *object); +/** + * kasan_unpoison_new_object - Temporarily unpoison a new slab object. + * @cache: Cache the object belong to. + * @object: Pointer to the object. + * + * This function is intended for the slab allocator's internal use. It + * temporarily unpoisons an object from a newly allocated slab without doing + * anything else. The object must later be repoisoned by + * kasan_poison_new_object(). 
+ */ +static __always_inline void kasan_unpoison_new_object(struct kmem_cache *cache, void *object) { if (kasan_enabled()) - __kasan_unpoison_object_data(cache, object); + __kasan_unpoison_new_object(cache, object); } -void __kasan_poison_object_data(struct kmem_cache *cache, void *object); -static __always_inline void kasan_poison_object_data(struct kmem_cache *cache, +void __kasan_poison_new_object(struct kmem_cache *cache, void *object); +/** + * kasan_poison_new_object - Repoison a new slab object. + * @cache: Cache the object belong to. + * @object: Pointer to the object. + * + * This function is intended for the slab allocator's internal use. It + * repoisons an object that was previously unpoisoned by + * kasan_unpoison_new_object() without doing anything else. + */ +static __always_inline void kasan_poison_new_object(struct kmem_cache *cache, void *object) { if (kasan_enabled()) - __kasan_poison_object_data(cache, object); + __kasan_poison_new_object(cache, object); } void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, @@ -342,9 +361,9 @@ static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, return false; } static inline void kasan_poison_slab(struct slab *slab) {} -static inline void kasan_unpoison_object_data(struct kmem_cache *cache, +static inline void kasan_unpoison_new_object(struct kmem_cache *cache, void *object) {} -static inline void kasan_poison_object_data(struct kmem_cache *cache, +static inline void kasan_poison_new_object(struct kmem_cache *cache, void *object) {} static inline void *kasan_init_slab_obj(struct kmem_cache *cache, const void *object) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index bf16c2dfa8e7..f4255e807b74 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -143,12 +143,12 @@ void __kasan_poison_slab(struct slab *slab) KASAN_SLAB_REDZONE, false); } -void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) +void __kasan_unpoison_new_object(struct
kmem_cache *cache, void *object) { kasan_unpoison(object, cache->object_size, false); } -void __kasan_poison_object_data(struct kmem_cache *cache, void *object) +void __kasan_poison_new_object(struct kmem_cache *cache, void *object) { kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), KASAN_SLAB_REDZONE, false); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index d687f09a7ae3..0154d200be40 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -130,7 +130,7 @@ void kasan_poison(const void *addr, size_t size, u8 value, bool init) /* * Perform shadow offset calculation based on untagged address, as - * some of the callers (e.g. kasan_poison_object_data) pass tagged + * some of the callers (e.g. kasan_poison_new_object) pass tagged * addresses to this function. */ addr = kasan_reset_tag(addr); @@ -170,7 +170,7 @@ void kasan_unpoison(const void *addr, size_t size, bool init) /* * Perform shadow offset calculation based on untagged address, as - * some of the callers (e.g. kasan_unpoison_object_data) pass tagged + * some of the callers (e.g. kasan_unpoison_new_object) pass tagged * addresses to this function. */ addr = kasan_reset_tag(addr); diff --git a/mm/slab.c b/mm/slab.c index 9ad3d0f2d1a5..773c79e153f3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2327,11 +2327,9 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab) * They must also be threaded. 
*/ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { - kasan_unpoison_object_data(cachep, - objp + obj_offset(cachep)); + kasan_unpoison_new_object(cachep, objp + obj_offset(cachep)); cachep->ctor(objp + obj_offset(cachep)); - kasan_poison_object_data( - cachep, objp + obj_offset(cachep)); + kasan_poison_new_object(cachep, objp + obj_offset(cachep)); } if (cachep->flags & SLAB_RED_ZONE) { @@ -2472,9 +2470,9 @@ static void cache_init_objs(struct kmem_cache *cachep, /* constructor could break poison info */ if (DEBUG == 0 && cachep->ctor) { - kasan_unpoison_object_data(cachep, objp); + kasan_unpoison_new_object(cachep, objp); cachep->ctor(objp); - kasan_poison_object_data(cachep, objp); + kasan_poison_new_object(cachep, objp); } if (!shuffled) diff --git a/mm/slub.c b/mm/slub.c index 782bd8a6bd34..891742e5932a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1860,9 +1860,9 @@ static void *setup_object(struct kmem_cache *s, void *object) setup_object_debug(s, object); object = kasan_init_slab_obj(s, object); if (unlikely(s->ctor)) { - kasan_unpoison_object_data(s, object); + kasan_unpoison_new_object(s, object); s->ctor(object); - kasan_poison_object_data(s, object); + kasan_poison_new_object(s, object); } return object; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b157efea5dea..63bb6526399d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -337,7 +337,7 @@ static struct sk_buff *napi_skb_cache_get(void) } skb = nc->skb_cache[--nc->skb_count]; - kasan_unpoison_object_data(skbuff_cache, skb); + kasan_unpoison_new_object(skbuff_cache, skb); return skb; } @@ -1309,13 +1309,13 @@ static void napi_skb_cache_put(struct sk_buff *skb) struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); u32 i; - kasan_poison_object_data(skbuff_cache, skb); + kasan_poison_new_object(skbuff_cache, skb); nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) - 
kasan_unpoison_object_data(skbuff_cache, - nc->skb_cache[i]); + kasan_unpoison_new_object(skbuff_cache, + nc->skb_cache[i]); kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_HALF, nc->skb_cache + NAPI_SKB_CACHE_HALF); -- cgit v1.2.3 From 91349f541e7daa6cce15e01e7ffe4fd63731ead9 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 22:19:53 +0100 Subject: lib/stackdepot: fix comment in include/linux/stackdepot.h As stack traces can now be evicted from the stack depot, remove the comment saying that they are never removed. Link: https://lkml.kernel.org/r/0ebe712d91f8d302a8947d3c9e9123bc2b1b8440.1703020707.git.andreyknvl@google.com Fixes: 108be8def46e ("lib/stackdepot: allow users to evict stack traces") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Tetsuo Handa Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index a6796f178913..adcbb8f23600 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -11,8 +11,6 @@ * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free * stack traces often repeat, using stack depot allows to save about 100x space. * - * Stack traces are never removed from the stack depot. - * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * -- cgit v1.2.3 From 7fbb5e188248c50f737720825da1864ce42536d1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 19 Dec 2023 21:41:23 -0800 Subject: mm: remove VM_EXEC requirement for THP eligibility Commit e6be37b2e7bd ("mm/huge_memory.c: add missing read-only THP checking in transparent_hugepage_enabled()") introduced the VM_EXEC requirement, which is not strictly needed. 
lld's default --rosegment option and GNU ld's -z separate-code option (default on Linux/x86 since binutils 2.31) create a read-only PT_LOAD segment without the PF_X flag, which should be eligible for THP. Certain architectures support medium and large code models, where .lrodata may be placed in a separate read-only PT_LOAD segment, which should be eligible for THP as well. Link: https://lkml.kernel.org/r/20231220054123.1266001-1-maskray@google.com Signed-off-by: Fangrui Song Acked-by: Yang Shi Cc: Miaohe Lin Cc: Song Liu Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fa7a38a30fc6..5adb86af35fc 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -206,7 +206,6 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) inode = vma->vm_file->f_inode; return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) && - (vma->vm_flags & VM_EXEC) && !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -- cgit v1.2.3 From 5ec8e8ea8b7783fab150cf86404fc38cb4db8800 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Fri, 13 Oct 2023 18:34:27 +0530 Subject: mm/sparsemem: fix race in accessing memory_section->usage The below race is observed on a PFN which falls into the device memory region with the system memory configuration where PFN's are such that [ZONE_NORMAL ZONE_DEVICE ZONE_NORMAL]. Since normal zone start and end pfn contains the device memory PFN's as well, the compaction triggered will try on the device memory PFN's too though they end up in NOP(because pfn_to_online_page() returns NULL for ZONE_DEVICE memory sections). When from other core, the section mappings are being removed for the ZONE_DEVICE region, that the PFN in question belongs to, on which compaction is currently being operated is resulting into the kernel crash with CONFIG_SPARSEMEM_VMEMMAP enabled.
The crash logs can be seen at [1]. compact_zone() memunmap_pages ------------- --------------- __pageblock_pfn_to_page ...... (a)pfn_valid(): valid_section()//return true (b)__remove_pages()-> sparse_remove_section()-> section_deactivate(): [Free the array ms->usage and set ms->usage = NULL] pfn_section_valid() [Access ms->usage which is NULL] NOTE: From the above it can be said that the race is reduced to between the pfn_valid()/pfn_section_valid() and the section deactivate with SPARSEMEM_VMEMMAP enabled. The commit b943f045a9af("mm/sparse: fix kernel crash with pfn_section_valid check") tried to address the same problem by clearing the SECTION_HAS_MEM_MAP with the expectation of valid_section() returns false thus ms->usage is not accessed. Fix this issue by the below steps: a) Clear SECTION_HAS_MEM_MAP before freeing the ->usage. b) RCU protected read side critical section will either return NULL when SECTION_HAS_MEM_MAP is cleared or can successfully access ->usage. c) Free the ->usage with kfree_rcu() and set ms->usage = NULL. No attempt will be made to access ->usage after this as the SECTION_HAS_MEM_MAP is cleared thus valid_section() return false. Thanks to David/Pavan for their inputs on this patch. [1] https://lore.kernel.org/linux-mm/994410bb-89aa-d987-1f50-f514903c55aa@quicinc.com/ On Snapdragon SoC, with the mentioned memory configuration of PFN's as [ZONE_NORMAL ZONE_DEVICE ZONE_NORMAL], we are able to see bunch of issues daily while testing on a device farm. For this particular issue below is the log. Though the below log is not directly pointing to the pfn_section_valid(){ ms->usage;}, when we loaded this dump on T32 lauterbach tool, it is pointing.
[ 540.578056] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 540.578068] Mem abort info: [ 540.578070] ESR = 0x0000000096000005 [ 540.578073] EC = 0x25: DABT (current EL), IL = 32 bits [ 540.578077] SET = 0, FnV = 0 [ 540.578080] EA = 0, S1PTW = 0 [ 540.578082] FSC = 0x05: level 1 translation fault [ 540.578085] Data abort info: [ 540.578086] ISV = 0, ISS = 0x00000005 [ 540.578088] CM = 0, WnR = 0 [ 540.579431] pstate: 82400005 (Nzcv daif +PAN -UAO +TCO -DIT -SSBSBTYPE=--) [ 540.579436] pc : __pageblock_pfn_to_page+0x6c/0x14c [ 540.579454] lr : compact_zone+0x994/0x1058 [ 540.579460] sp : ffffffc03579b510 [ 540.579463] x29: ffffffc03579b510 x28: 0000000000235800 x27:000000000000000c [ 540.579470] x26: 0000000000235c00 x25: 0000000000000068 x24:ffffffc03579b640 [ 540.579477] x23: 0000000000000001 x22: ffffffc03579b660 x21:0000000000000000 [ 540.579483] x20: 0000000000235bff x19: ffffffdebf7e3940 x18:ffffffdebf66d140 [ 540.579489] x17: 00000000739ba063 x16: 00000000739ba063 x15:00000000009f4bff [ 540.579495] x14: 0000008000000000 x13: 0000000000000000 x12:0000000000000001 [ 540.579501] x11: 0000000000000000 x10: 0000000000000000 x9 :ffffff897d2cd440 [ 540.579507] x8 : 0000000000000000 x7 : 0000000000000000 x6 :ffffffc03579b5b4 [ 540.579512] x5 : 0000000000027f25 x4 : ffffffc03579b5b8 x3 :0000000000000001 [ 540.579518] x2 : ffffffdebf7e3940 x1 : 0000000000235c00 x0 :0000000000235800 [ 540.579524] Call trace: [ 540.579527] __pageblock_pfn_to_page+0x6c/0x14c [ 540.579533] compact_zone+0x994/0x1058 [ 540.579536] try_to_compact_pages+0x128/0x378 [ 540.579540] __alloc_pages_direct_compact+0x80/0x2b0 [ 540.579544] __alloc_pages_slowpath+0x5c0/0xe10 [ 540.579547] __alloc_pages+0x250/0x2d0 [ 540.579550] __iommu_dma_alloc_noncontiguous+0x13c/0x3fc [ 540.579561] iommu_dma_alloc+0xa0/0x320 [ 540.579565] dma_alloc_attrs+0xd4/0x108 [quic_charante@quicinc.com: use kfree_rcu() in place of synchronize_rcu(), per David] Link: 
https://lkml.kernel.org/r/1698403778-20938-1-git-send-email-quic_charante@quicinc.com Link: https://lkml.kernel.org/r/1697202267-23600-1-git-send-email-quic_charante@quicinc.com Fixes: f46edbd1b151 ("mm/sparsemem: add helpers track active portions of a section at boot") Signed-off-by: Charan Teja Kalla Cc: Aneesh Kumar K.V Cc: Dan Williams Cc: David Hildenbrand Cc: Mel Gorman Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 14 +++++++++++--- mm/sparse.c | 17 +++++++++-------- 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ec73582e7d27..2efd3be484fd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1799,6 +1799,7 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { + struct rcu_head rcu; #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif @@ -1992,7 +1993,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); - return test_bit(idx, ms->usage->subsection_map); + return test_bit(idx, READ_ONCE(ms->usage)->subsection_map); } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) @@ -2016,6 +2017,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; + int ret; /* * Ensure the upper PAGE_SHIFT bits are clear in the @@ -2029,13 +2031,19 @@ static inline int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); - if (!valid_section(ms)) + rcu_read_lock(); + if (!valid_section(ms)) { + rcu_read_unlock(); return 0; + } /* * Traditionally early sections always returned pfn_valid() for * the entire 
section-sized span. */ - return early_section(ms) || pfn_section_valid(ms, pfn); + ret = early_section(ms) || pfn_section_valid(ms, pfn); + rcu_read_unlock(); + + return ret; } #endif diff --git a/mm/sparse.c b/mm/sparse.c index 77d91e565045..338cf946dee8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -791,6 +791,13 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, if (empty) { unsigned long section_nr = pfn_to_section_nr(pfn); + /* + * Mark the section invalid so that valid_section() + * return false. This prevents code from dereferencing + * ms->usage array. + */ + ms->section_mem_map &= ~SECTION_HAS_MEM_MAP; + /* * When removing an early section, the usage map is kept (as the * usage maps of other sections fall into the same page). It @@ -799,16 +806,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * was allocated during boot. */ if (!PageReserved(virt_to_page(ms->usage))) { - kfree(ms->usage); - ms->usage = NULL; + kfree_rcu(ms->usage, rcu); + WRITE_ONCE(ms->usage, NULL); } memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - /* - * Mark the section invalid so that valid_section() - * return false. This prevents code from dereferencing - * ms->usage array. - */ - ms->section_mem_map &= ~SECTION_HAS_MEM_MAP; } /* -- cgit v1.2.3 From 5cb6674b694b84803cbee8bfccaa2bfdfeb6eae4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 21 Dec 2023 21:04:44 +0100 Subject: mm, kasan: use KASAN_TAG_KERNEL instead of 0xff Use the KASAN_TAG_KERNEL macro instead of open-coding 0xff in the mm code. This macro is provided by include/linux/kasan-tags.h, which does not include any other headers, so it's safe to include it into mm.h without causing circular include dependencies.
Link: https://lkml.kernel.org/r/71db9087b0aebb6c4dccbc609cc0cd50621533c7.1703188911.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 1 + include/linux/mm.h | 4 ++-- mm/page_alloc.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d49e3d4c099e..dbb06d789e74 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -4,6 +4,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index b72bf25a45cf..2563ffdb51bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1815,7 +1815,7 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) static inline u8 page_kasan_tag(const struct page *page) { - u8 tag = 0xff; + u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; @@ -1844,7 +1844,7 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag) static inline void page_kasan_tag_reset(struct page *page) { if (kasan_enabled()) - page_kasan_tag_set(page, 0xff); + page_kasan_tag_set(page, KASAN_TAG_KERNEL); } #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 692f33c764d2..5526797b7f96 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1059,7 +1059,7 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return deferred_pages_enabled(); - return page_kasan_tag(page) == 0xff; + return page_kasan_tag(page) == KASAN_TAG_KERNEL; } static void kernel_init_pages(struct page *page, int numpages) -- cgit v1.2.3 From 9d5fafd5d882446999366f673ab06edba453f862 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:25 +0100 Subject: mm/rmap: rename 
hugepage_add* to hugetlb_add* Patch series "mm/rmap: interface overhaul", v2. This series overhauls the rmap interface, to get rid of the "bool compound" / RMAP_COMPOUND parameter with the goal of making the interface less error prone, more future proof, and more natural to extend to "batching". Also, this converts the interface to always consume folio+subpage, which speeds up operations on large folios. Further, this series adds PTE-batching variants for 4 rmap functions, whereby only folio_add_anon_rmap_ptes() is used for batching in this series when PTE-remapping a PMD-mapped THP. folio_remove_rmap_ptes(), folio_try_dup_anon_rmap_ptes() and folio_dup_file_rmap_ptes() will soon come in handy[1,2]. This series performs a lot of folio conversion along the way. Most of the added LOC in the diff are only due to documentation. As we're moving to a pte/pmd interface where we clearly express the mapping granularity we are dealing with, we first get the remainder of hugetlb out of the way, as it is special and expected to remain special: it treats everything as a "single logical PTE" and only currently allows entire mappings. Even if we'd ever support partial mappings, I strongly assume the interface and implementation will still differ heavily: hopefully we can avoid working on subpages/subpage mapcounts completely and only add a "count" parameter for them to enable batching.
New (extended) hugetlb interface that operates on entire folio: * hugetlb_add_new_anon_rmap() -> Already existed * hugetlb_add_anon_rmap() -> Already existed * hugetlb_try_dup_anon_rmap() * hugetlb_try_share_anon_rmap() * hugetlb_add_file_rmap() * hugetlb_remove_rmap() New "ordinary" interface for small folios / THP:: * folio_add_new_anon_rmap() -> Already existed * folio_add_anon_rmap_[pte|ptes|pmd]() * folio_try_dup_anon_rmap_[pte|ptes|pmd]() * folio_try_share_anon_rmap_[pte|pmd]() * folio_add_file_rmap_[pte|ptes|pmd]() * folio_dup_file_rmap_[pte|ptes|pmd]() * folio_remove_rmap_[pte|ptes|pmd]() folio_add_new_anon_rmap() will always map at the largest granularity possible (currently, a single PMD to cover a PMD-sized THP). Could be extended if ever required. In the future, we might want "_pud" variants and eventually "_pmds" variants for batching. I ran some simple microbenchmarks on an Intel(R) Xeon(R) Silver 4210R: measuring munmap(), fork(), cow, MADV_DONTNEED on each PTE ... and PTE remapping PMD-mapped THPs on 1 GiB of memory. For small folios, there is barely a change (< 1% improvement for me). For PTE-mapped THP: * PTE-remapping a PMD-mapped THP is more than 10% faster. * fork() is more than 4% faster. * MADV_DONTNEED is 2% faster * COW when writing only a single byte on a COW-shared PTE is 1% faster * munmap() barely changes (< 1%). [1] https://lkml.kernel.org/r/20230810103332.3062143-1-ryan.roberts@arm.com [2] https://lkml.kernel.org/r/20231204105440.61448-1-ryan.roberts@arm.com This patch (of 40): Let's just call it "hugetlb_". Yes, it's all already inconsistent and confusing because we have a lot of "hugepage_" functions for legacy reasons. But "hugetlb" cannot possibly be confused with transparent huge pages, and it matches "hugetlb.c" and "folio_test_hugetlb()". So let's minimize confusion in rmap code. 
Link: https://lkml.kernel.org/r/20231220224504.646757-1-david@redhat.com Link: https://lkml.kernel.org/r/20231220224504.646757-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- mm/hugetlb.c | 8 ++++---- mm/migrate.c | 4 ++-- mm/rmap.c | 8 ++++---- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0ae2bb0e77f5..36096ba69bdc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -206,9 +206,9 @@ void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr, void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); -void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *, +void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, +void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); static inline void __page_dup_rmap(struct page *page, bool compound) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6feb3e0630d1..305f3ca1dee6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5285,7 +5285,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add pte_t newpte = make_huge_pte(vma, &new_folio->page, 1); __folio_mark_uptodate(new_folio); - hugepage_add_new_anon_rmap(new_folio, vma, addr); + hugetlb_add_new_anon_rmap(new_folio, vma, addr); if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz); @@ -5988,7 +5988,7 @@ retry_avoidcopy: /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); page_remove_rmap(&old_folio->page, vma, true); - 
hugepage_add_new_anon_rmap(new_folio, vma, haddr); + hugetlb_add_new_anon_rmap(new_folio, vma, haddr); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); @@ -6277,7 +6277,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto backout; if (anon_rmap) - hugepage_add_new_anon_rmap(folio, vma, haddr); + hugetlb_add_new_anon_rmap(folio, vma, haddr); else page_dup_file_rmap(&folio->page, true); new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) @@ -6732,7 +6732,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, if (folio_in_pagecache) page_dup_file_rmap(&folio->page, true); else - hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr); + hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY diff --git a/mm/migrate.c b/mm/migrate.c index bad3039d165e..7d1c3f292d24 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -249,8 +249,8 @@ static bool remove_migration_pte(struct folio *folio, pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (folio_test_anon(folio)) - hugepage_add_anon_rmap(folio, vma, pvmw.address, - rmap_flags); + hugetlb_add_anon_rmap(folio, vma, pvmw.address, + rmap_flags); else page_dup_file_rmap(new, true); set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte, diff --git a/mm/rmap.c b/mm/rmap.c index 23da5b1ac33b..9845499b22f8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2625,8 +2625,8 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) * * RMAP_COMPOUND is ignored. 
*/ -void hugepage_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, - unsigned long address, rmap_t flags) +void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, + unsigned long address, rmap_t flags) { VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); @@ -2637,8 +2637,8 @@ void hugepage_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, PageAnonExclusive(&folio->page), folio); } -void hugepage_add_new_anon_rmap(struct folio *folio, - struct vm_area_struct *vma, unsigned long address) +void hugetlb_add_new_anon_rmap(struct folio *folio, + struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ -- cgit v1.2.3 From e135826b2da0cf25305086dc9ac1e91718a148e1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:26 +0100 Subject: mm/rmap: introduce and use hugetlb_remove_rmap() hugetlb rmap handling differs quite a lot from "ordinary" rmap code. For example, hugetlb currently only supports entire mappings, and treats any mapping as mapped using a single "logical PTE". Let's move it out of the way so we can overhaul our "ordinary" rmap implementation/interface. Let's introduce and use hugetlb_remove_rmap() and remove the hugetlb code from page_remove_rmap(). This effectively removes one check on the small-folio path as well. Add sanity checks that we end up with the right folios in the right functions. Note: all possible candidates that need care are page_remove_rmap() calls that pass compound=true.
Link: https://lkml.kernel.org/r/20231220224504.646757-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 +++++++ mm/hugetlb.c | 4 ++-- mm/rmap.c | 18 +++++++++--------- 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 36096ba69bdc..64ae6c4d7272 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -211,6 +211,13 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); +static inline void hugetlb_remove_rmap(struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + + atomic_dec(&folio->_entire_mapcount); +} + static inline void __page_dup_rmap(struct page *page, bool compound) { if (compound) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 305f3ca1dee6..ef48ae673890 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5676,7 +5676,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, make_pte_marker(PTE_MARKER_UFFD_WP), sz); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page, vma, true); + hugetlb_remove_rmap(page_folio(page)); spin_unlock(ptl); tlb_remove_page_size(tlb, page, huge_page_size(h)); @@ -5987,7 +5987,7 @@ retry_avoidcopy: /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); - page_remove_rmap(&old_folio->page, vma, true); + hugetlb_remove_rmap(old_folio); hugetlb_add_new_anon_rmap(new_folio, vma, haddr); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); diff --git a/mm/rmap.c b/mm/rmap.c index 9845499b22f8..261e1af0d254 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1480,15 +1480,9 @@ void page_remove_rmap(struct page *page, struct 
vm_area_struct *vma, bool last; enum node_stat_item idx; + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_BUG_ON_PAGE(compound && !PageHead(page), page); - /* Hugetlb pages are not counted in NR_*MAPPED */ - if (unlikely(folio_test_hugetlb(folio))) { - /* hugetlb pages are always mapped with pmds */ - atomic_dec(&folio->_entire_mapcount); - return; - } - /* Is page being unmapped by PTE? Is this its last map to be removed? */ if (likely(!compound)) { last = atomic_add_negative(-1, &page->_mapcount); @@ -1846,7 +1840,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, dec_mm_counter(mm, mm_counter_file(&folio->page)); } discard: - page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); + if (unlikely(folio_test_hugetlb(folio))) + hugetlb_remove_rmap(folio); + else + page_remove_rmap(subpage, vma, false); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); @@ -2199,7 +2196,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ } - page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); + if (unlikely(folio_test_hugetlb(folio))) + hugetlb_remove_rmap(folio); + else + page_remove_rmap(subpage, vma, false); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); -- cgit v1.2.3 From 44887f39945519fa8405133b1acd098fda9c9746 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:27 +0100 Subject: mm/rmap: introduce and use hugetlb_add_file_rmap() hugetlb rmap handling differs quite a lot from "ordinary" rmap code. For example, hugetlb currently only supports entire mappings, and treats any mapping as mapped using a single "logical PTE". Let's move it out of the way so we can overhaul our "ordinary" rmap implementation/interface. Right now we're using page_dup_file_rmap() in some cases where "ordinary" rmap code would have used page_add_file_rmap(). So let's introduce and use hugetlb_add_file_rmap() instead.
We won't be adding a "hugetlb_dup_file_rmap()" function for the fork() case, as it would be doing the same: "dup" is just an optimization for "add". What remains is a single page_dup_file_rmap() call in fork() code. Add sanity checks that we end up with the right folios in the right functions. Link: https://lkml.kernel.org/r/20231220224504.646757-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 8 ++++++++ mm/hugetlb.c | 6 +++--- mm/migrate.c | 2 +- mm/rmap.c | 1 + 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 64ae6c4d7272..56900a16f41a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -211,6 +211,14 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); +static inline void hugetlb_add_file_rmap(struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); + + atomic_inc(&folio->_entire_mapcount); +} + static inline void hugetlb_remove_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ef48ae673890..57e898187931 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5408,7 +5408,7 @@ again: * sleep during the process.
*/ if (!folio_test_anon(pte_folio)) { - page_dup_file_rmap(&pte_folio->page, true); + hugetlb_add_file_rmap(pte_folio); } else if (page_try_dup_anon_rmap(&pte_folio->page, true, src_vma)) { pte_t src_pte_old = entry; @@ -6279,7 +6279,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (anon_rmap) hugetlb_add_new_anon_rmap(folio, vma, haddr); else - page_dup_file_rmap(&folio->page, true); + hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); /* @@ -6730,7 +6730,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out_release_unlock; if (folio_in_pagecache) - page_dup_file_rmap(&folio->page, true); + hugetlb_add_file_rmap(folio); else hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr); diff --git a/mm/migrate.c b/mm/migrate.c index 7d1c3f292d24..0e912443a18c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -252,7 +252,7 @@ static bool remove_migration_pte(struct folio *folio, hugetlb_add_anon_rmap(folio, vma, pvmw.address, rmap_flags); else - page_dup_file_rmap(new, true); + hugetlb_add_file_rmap(folio); set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte, psize); } else diff --git a/mm/rmap.c b/mm/rmap.c index 261e1af0d254..a57ec926daf0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1395,6 +1395,7 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page, unsigned int nr_pmdmapped = 0, first; int nr = 0; + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio); /* Is page being mapped by PTE? Is this its first map to be added? */ -- cgit v1.2.3 From ebe2e35ec0f256372c158a18de459fb60070b313 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:28 +0100 Subject: mm/rmap: introduce and use hugetlb_try_dup_anon_rmap() hugetlb rmap handling differs quite a lot from "ordinary" rmap code. 
For example, hugetlb currently only supports entire mappings, and treats any mapping as mapped using a single "logical PTE". Let's move it out of the way so we can overhaul our "ordinary" rmap implementation/interface. So let's introduce and use hugetlb_try_dup_anon_rmap() to make all hugetlb handling use dedicated hugetlb_* rmap functions. Add sanity checks that we end up with the right folios in the right functions. Note that is_device_private_page() does not apply to hugetlb. Link: https://lkml.kernel.org/r/20231220224504.646757-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++++++++--- include/linux/rmap.h | 18 ++++++++++++++++++ mm/hugetlb.c | 3 +-- 3 files changed, 28 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2563ffdb51bc..75bba6102825 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1964,15 +1964,21 @@ static inline bool page_maybe_dma_pinned(struct page *page) * * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq.
*/ -static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, - struct page *page) +static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, + struct folio *folio) { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; - return page_maybe_dma_pinned(page); + return folio_maybe_dma_pinned(folio); +} + +static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, + struct page *page) +{ + return folio_needs_cow_for_dma(vma, page_folio(page)); } /** diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 56900a16f41a..5f26752de945 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -211,6 +211,22 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); +/* See page_try_dup_anon_rmap() */ +static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, + struct vm_area_struct *vma) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + + if (PageAnonExclusive(&folio->page)) { + if (unlikely(folio_needs_cow_for_dma(vma, folio))) + return -EBUSY; + ClearPageAnonExclusive(&folio->page); + } + atomic_inc(&folio->_entire_mapcount); + return 0; +} + static inline void hugetlb_add_file_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); @@ -228,6 +244,8 @@ static inline void hugetlb_remove_rmap(struct folio *folio) static inline void __page_dup_rmap(struct page *page, bool compound) { + VM_WARN_ON(folio_test_hugetlb(page_folio(page))); + if (compound) { struct folio *folio = (struct folio *)page; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 57e898187931..378e460a6ab4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5409,8 +5409,7 @@ again: */ if (!folio_test_anon(pte_folio)) { hugetlb_add_file_rmap(pte_folio); - } else if 
(page_try_dup_anon_rmap(&pte_folio->page, - true, src_vma)) { + } else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) { pte_t src_pte_old = entry; struct folio *new_folio; -- cgit v1.2.3 From 0c2ec32bf0b2f0d7ccb98c53ee5d255d68e73595 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:29 +0100 Subject: mm/rmap: introduce and use hugetlb_try_share_anon_rmap() hugetlb rmap handling differs quite a lot from "ordinary" rmap code. For example, hugetlb currently only supports entire mappings, and treats any mapping as mapped using a single "logical PTE". Let's move it out of the way so we can overhaul our "ordinary" rmap implementation/interface. So let's introduce and use hugetlb_try_share_anon_rmap() to make all hugetlb handling use dedicated hugetlb_* rmap functions. Add sanity checks that we end up with the right folios in the right functions. Note that try_to_unmap_one() does not need care. Easy to spot because among all that nasty hugetlb special-casing in that function, we're not using set_huge_pte_at() on the anon path -- well, and that code assumes that we would want to swapout.
Link: https://lkml.kernel.org/r/20231220224504.646757-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 25 +++++++++++++++++++++++++ mm/rmap.c | 15 ++++++++++----- 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5f26752de945..d6fefa0f0410 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -227,6 +227,30 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, return 0; } +/* See page_try_share_anon_rmap() */ +static inline int hugetlb_try_share_anon_rmap(struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio); + + /* Paired with the memory barrier in try_grab_folio(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb(); + + if (unlikely(folio_maybe_dma_pinned(folio))) + return -EBUSY; + ClearPageAnonExclusive(&folio->page); + + /* + * This is conceptually a smp_wmb() paired with the smp_rmb() in + * gup_must_unshare(). + */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb__after_atomic(); + return 0; +} + static inline void hugetlb_add_file_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); @@ -331,6 +355,7 @@ dup: */ static inline int page_try_share_anon_rmap(struct page *page) { + VM_WARN_ON(folio_test_hugetlb(page_folio(page))); VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page); /* device private pages cannot get pinned via GUP. 
*/ diff --git a/mm/rmap.c b/mm/rmap.c index a57ec926daf0..c229e48cf5a9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2149,13 +2149,18 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, !anon_exclusive, subpage); /* See page_try_share_anon_rmap(): clear PTE first. */ - if (anon_exclusive && - page_try_share_anon_rmap(subpage)) { - if (folio_test_hugetlb(folio)) + if (folio_test_hugetlb(folio)) { + if (anon_exclusive && + hugetlb_try_share_anon_rmap(folio)) { set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); - else - set_pte_at(mm, address, pvmw.pte, pteval); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + } else if (anon_exclusive && + page_try_share_anon_rmap(subpage)) { + set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; -- cgit v1.2.3 From 68f0320824fa59c5429cbc811e6c46e7a30ea32c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:31 +0100 Subject: mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]() Let's get rid of the compound parameter and instead define explicitly which mappings we're adding. That is more future proof, easier to read and harder to mess up. Use an enum to express the granularity internally. Make the compiler always special-case on the granularity by using __always_inline. Replace the "compound" check by a switch-case that will be removed by the compiler completely. Add plenty of sanity checks with CONFIG_DEBUG_VM. Replace the folio_test_pmd_mappable() check by a config check in the caller and sanity checks. Convert the single user of folio_add_file_rmap_range(). While at it, consistently use "int" instead of "unsigned int" in rmap code when dealing with mapcounts and the number of pages. This function design can later easily be extended to PUDs and to batch PMDs. Note that for now we don't support anything bigger than PMD-sized folios (as we cleanly separated hugetlb handling).
Sanity checks will catch if that ever changes. Next up is removing page_remove_rmap() along with its "compound" parameter and similarly converting all other rmap functions. Link: https://lkml.kernel.org/r/20231220224504.646757-8-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 46 ++++++++++++++++++++++++++++-- mm/memory.c | 2 +- mm/rmap.c | 79 +++++++++++++++++++++++++++++++++------------------- 3 files changed, 95 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d6fefa0f0410..3d86a76b2836 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -191,6 +191,44 @@ typedef int __bitwise rmap_t; */ #define RMAP_COMPOUND ((__force rmap_t)BIT(1)) +/* + * Internally, we're using an enum to specify the granularity. We make the + * compiler emit specialized code for each granularity. + */ +enum rmap_level { + RMAP_LEVEL_PTE = 0, + RMAP_LEVEL_PMD, +}; + +static inline void __folio_rmap_sanity_checks(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) +{ + /* hugetlb folios are handled separately. */ + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(folio_test_large(folio) && + !folio_test_large_rmappable(folio), folio); + + VM_WARN_ON_ONCE(nr_pages <= 0); + VM_WARN_ON_FOLIO(page_folio(page) != folio, folio); + VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio); + + switch (level) { + case RMAP_LEVEL_PTE: + break; + case RMAP_LEVEL_PMD: + /* + * We don't support folios larger than a single PMD yet. So + * when RMAP_LEVEL_PMD is set, we assume that we are creating + * a single "entire" mapping of the folio.
+ */ + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); + break; + default: + VM_WARN_ON_ONCE(true); + } +} + /* * rmap interfaces called when adding or removing pte of page */ @@ -201,8 +239,12 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); -void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr, - struct vm_area_struct *, bool compound); +void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, + struct vm_area_struct *); +#define folio_add_file_rmap_pte(folio, page, vma) \ + folio_add_file_rmap_ptes(folio, page, 1, vma) +void folio_add_file_rmap_pmd(struct folio *, struct page *, + struct vm_area_struct *); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); diff --git a/mm/memory.c b/mm/memory.c index cfcaf4c0198c..9b977b2cf893 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4516,7 +4516,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, folio_add_lru_vma(folio, vma); } else { add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); - folio_add_file_rmap_range(folio, page, nr, vma, false); + folio_add_file_rmap_ptes(folio, page, nr, vma); } set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); diff --git a/mm/rmap.c b/mm/rmap.c index 6a1829324053..cc1fc2d570f0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1378,31 +1378,18 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); } -/** - * folio_add_file_rmap_range - add pte mapping to page range of a folio - * @folio: The folio to add the mapping to - * @page: The first page to add - * @nr_pages: The number of pages which will be mapped - * @vma: the vm area in which the mapping is added - * @compound: charge the page as compound or small page - * - * The page range of 
folio is defined by [first_page, first_page + nr_pages) - * - * The caller needs to hold the pte lock. - */ -void folio_add_file_rmap_range(struct folio *folio, struct page *page, - unsigned int nr_pages, struct vm_area_struct *vma, - bool compound) +static __always_inline void __folio_add_file_rmap(struct folio *folio, + struct page *page, int nr_pages, struct vm_area_struct *vma, + enum rmap_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; - unsigned int nr_pmdmapped = 0, first; - int nr = 0; + int nr = 0, nr_pmdmapped = 0, first; - VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); - VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio); + VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); + __folio_rmap_sanity_checks(folio, page, nr_pages, level); - /* Is page being mapped by PTE? Is this its first map to be added? */ - if (likely(!compound)) { + switch (level) { + case RMAP_LEVEL_PTE: do { first = atomic_inc_and_test(&page->_mapcount); if (first && folio_test_large(folio)) { @@ -1413,9 +1400,8 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page, if (first) nr++; } while (page++, --nr_pages > 0); - } else if (folio_test_pmd_mappable(folio)) { - /* That test is redundant: it's for safety or to optimize out */ - + break; + case RMAP_LEVEL_PMD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); @@ -1430,6 +1416,7 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page, nr = 0; } } + break; } if (nr_pmdmapped) @@ -1443,6 +1430,43 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page, mlock_vma_folio(folio, vma); } +/** + * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio + * @folio: The folio to add the mappings to + * @page: The first page to add + * @nr_pages: The number of pages that will be mapped using PTEs + * @vma: The vm area in which the mappings are added + * + * The page range of 
the folio is defined by [page, page + nr_pages) + * + * The caller needs to hold the page table lock. + */ +void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, + int nr_pages, struct vm_area_struct *vma) +{ + __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); +} + +/** + * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio + * @folio: The folio to add the mapping to + * @page: The first page to add + * @vma: The vm area in which the mapping is added + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock. + */ +void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, + struct vm_area_struct *vma) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); +#else + WARN_ON_ONCE(true); +#endif +} + /** * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to @@ -1455,16 +1479,13 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { struct folio *folio = page_folio(page); - unsigned int nr_pages; VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page); if (likely(!compound)) - nr_pages = 1; + folio_add_file_rmap_pte(folio, page, vma); else - nr_pages = folio_nr_pages(folio); - - folio_add_file_rmap_range(folio, page, nr_pages, vma, compound); + folio_add_file_rmap_pmd(folio, page, vma); } /** -- cgit v1.2.3 From be6e57cfabe99a5d3b3869103c4ea0ed4a9692d4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:36 +0100 Subject: mm/rmap: remove page_add_file_rmap() All users are gone, let's remove it. 
Link: https://lkml.kernel.org/r/20231220224504.646757-13-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 -- mm/rmap.c | 21 --------------------- 2 files changed, 23 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3d86a76b2836..6a4db6933e7d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -237,8 +237,6 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); -void page_add_file_rmap(struct page *, struct vm_area_struct *, - bool compound); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ diff --git a/mm/rmap.c b/mm/rmap.c index cc1fc2d570f0..5ab5ef10fbf5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1467,27 +1467,6 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, #endif } -/** - * page_add_file_rmap - add pte mapping to a file page - * @page: the page to add the mapping to - * @vma: the vm area in which the mapping is added - * @compound: charge the page as compound or small page - * - * The caller needs to hold the pte lock. 
- */ -void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, - bool compound) -{ - struct folio *folio = page_folio(page); - - VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page); - - if (likely(!compound)) - folio_add_file_rmap_pte(folio, page, vma); - else - folio_add_file_rmap_pmd(folio, page, vma); -} - /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from -- cgit v1.2.3 From 8bd5130070fbf2247a97c5361427a810522ac98a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:38 +0100 Subject: mm/rmap: introduce folio_add_anon_rmap_[pte|ptes|pmd]() Let's mimic what we did with folio_add_file_rmap_*() so we can similarly replace page_add_anon_rmap() next. Make the compiler always special-case on the granularity by using __always_inline. For the PageAnonExclusive sanity checks, when adding a PMD mapping, we're now also checking each individual subpage covered by that PMD, instead of only the head page. Note that the new functions ignore the RMAP_COMPOUND flag, which we will remove as soon as page_add_anon_rmap() is gone. 
Link: https://lkml.kernel.org/r/20231220224504.646757-15-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/rmap.h | 6 +++ mm/rmap.c | 120 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 88 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6a4db6933e7d..b5da3d86200e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -233,6 +233,12 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio, * rmap interfaces called when adding or removing pte of page */ void folio_move_anon_rmap(struct folio *, struct vm_area_struct *); +void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, + struct vm_area_struct *, unsigned long address, rmap_t flags); +#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \ + folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags) +void folio_add_anon_rmap_pmd(struct folio *, struct page *, + struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, diff --git a/mm/rmap.c b/mm/rmap.c index 895a8534a935..7f380f5a34c9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1299,40 +1299,20 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { struct folio *folio = page_folio(page); - atomic_t *mapped = &folio->_nr_pages_mapped; - int nr = 0, nr_pmdmapped = 0; - bool compound = flags & RMAP_COMPOUND; - bool first; - - VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); - /* Is page being mapped by PTE? Is this its first map to be added? 
*/ - if (likely(!compound)) { - first = atomic_inc_and_test(&page->_mapcount); - nr = first; - if (first && folio_test_large(folio)) { - nr = atomic_inc_return_relaxed(mapped); - nr = (nr < COMPOUND_MAPPED); - } - } else if (folio_test_pmd_mappable(folio)) { - /* That test is redundant: it's for safety or to optimize out */ + if (likely(!(flags & RMAP_COMPOUND))) + folio_add_anon_rmap_pte(folio, page, vma, address, flags); + else + folio_add_anon_rmap_pmd(folio, page, vma, address, flags); +} - first = atomic_inc_and_test(&folio->_entire_mapcount); - if (first) { - nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); - if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { - nr_pmdmapped = folio_nr_pages(folio); - nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); - /* Raced ahead of a remove and another add? */ - if (unlikely(nr < 0)) - nr = 0; - } else { - /* Raced ahead of a remove of COMPOUND_MAPPED */ - nr = 0; - } - } - } +static __always_inline void __folio_add_anon_rmap(struct folio *folio, + struct page *page, int nr_pages, struct vm_area_struct *vma, + unsigned long address, rmap_t flags, enum rmap_level level) +{ + int i, nr, nr_pmdmapped = 0; + nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped); if (nr_pmdmapped) __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped); if (nr) @@ -1346,18 +1326,34 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, * folio->index right when not given the address of the head * page. */ - VM_WARN_ON_FOLIO(folio_test_large(folio) && !compound, folio); + VM_WARN_ON_FOLIO(folio_test_large(folio) && + level != RMAP_LEVEL_PMD, folio); __folio_set_anon(folio, vma, address, !!(flags & RMAP_EXCLUSIVE)); } else if (likely(!folio_test_ksm(folio))) { __page_check_anon_rmap(folio, page, vma, address); } - if (flags & RMAP_EXCLUSIVE) - SetPageAnonExclusive(page); - /* While PTE-mapping a THP we have a PMD and a PTE mapping. 
*/ - VM_WARN_ON_FOLIO((atomic_read(&page->_mapcount) > 0 || - (folio_test_large(folio) && folio_entire_mapcount(folio) > 1)) && - PageAnonExclusive(page), folio); + + if (flags & RMAP_EXCLUSIVE) { + switch (level) { + case RMAP_LEVEL_PTE: + for (i = 0; i < nr_pages; i++) + SetPageAnonExclusive(page + i); + break; + case RMAP_LEVEL_PMD: + SetPageAnonExclusive(page); + break; + } + } + for (i = 0; i < nr_pages; i++) { + struct page *cur_page = page + i; + + /* While PTE-mapping a THP we have a PMD and a PTE mapping. */ + VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 || + (folio_test_large(folio) && + folio_entire_mapcount(folio) > 1)) && + PageAnonExclusive(cur_page), folio); + } /* * For large folio, only mlock it if it's fully mapped to VMA. It's @@ -1369,6 +1365,54 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, mlock_vma_folio(folio, vma); } +/** + * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio + * @folio: The folio to add the mappings to + * @page: The first page to add + * @nr_pages: The number of pages which will be mapped + * @vma: The vm area in which the mappings are added + * @address: The user virtual address of the first page to map + * @flags: The rmap flags + * + * The page range of folio is defined by [first_page, first_page + nr_pages) + * + * The caller needs to hold the page table lock, and the page must be locked in + * the anon_vma case: to serialize mapping,index checking after setting, + * and to ensure that an anon folio is not being upgraded racily to a KSM folio + * (but KSM folios are never downgraded). 
+ */ +void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page, + int nr_pages, struct vm_area_struct *vma, unsigned long address, + rmap_t flags) +{ + __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags, + RMAP_LEVEL_PTE); +} + +/** + * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio + * @folio: The folio to add the mapping to + * @page: The first page to add + * @vma: The vm area in which the mapping is added + * @address: The user virtual address of the first page to map + * @flags: The rmap flags + * + * The page range of folio is defined by [first_page, first_page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock, and the page must be locked in + * the anon_vma case: to serialize mapping,index checking after setting. + */ +void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, + struct vm_area_struct *vma, unsigned long address, rmap_t flags) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags, + RMAP_LEVEL_PMD); +#else + WARN_ON_ONCE(true); +#endif +} + /** * folio_add_new_anon_rmap - Add mapping to a new anonymous folio. * @folio: The folio to add the mapping to. -- cgit v1.2.3 From 84f0169e6c8a613012722e0d63302f9da4a72099 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:45 +0100 Subject: mm/rmap: remove page_add_anon_rmap() All users are gone, remove it and all traces. 
Link: https://lkml.kernel.org/r/20231220224504.646757-22-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 -- mm/rmap.c | 31 ++++--------------------------- 2 files changed, 4 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b5da3d86200e..fe7b5a8b0e75 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -239,8 +239,6 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags) void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, diff --git a/mm/rmap.c b/mm/rmap.c index 7f380f5a34c9..87415bbf2402 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1270,7 +1270,7 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page, * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. * - * We have exclusion against page_add_anon_rmap because the caller + * We have exclusion against folio_add_anon_rmap_*() because the caller * always holds the page locked. 
* * We have exclusion against folio_add_new_anon_rmap because those pages @@ -1283,29 +1283,6 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page, page); } -/** - * page_add_anon_rmap - add pte mapping to an anonymous page - * @page: the page to add the mapping to - * @vma: the vm area in which the mapping is added - * @address: the user virtual address mapped - * @flags: the rmap flags - * - * The caller needs to hold the pte lock, and the page must be locked in - * the anon_vma case: to serialize mapping,index checking after setting, - * and to ensure that PageAnon is not being upgraded racily to PageKsm - * (but PageKsm is never downgraded to PageAnon). - */ -void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, - unsigned long address, rmap_t flags) -{ - struct folio *folio = page_folio(page); - - if (likely(!(flags & RMAP_COMPOUND))) - folio_add_anon_rmap_pte(folio, page, vma, address, flags); - else - folio_add_anon_rmap_pmd(folio, page, vma, address, flags); -} - static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags, enum rmap_level level) @@ -1419,7 +1396,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * - * Like page_add_anon_rmap() but must only be called on *new* folios. + * Like folio_add_anon_rmap_*() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. * The folio does not have to be locked. 
* @@ -1479,7 +1456,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, if (nr) __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); - /* See comments in page_add_anon_rmap() */ + /* See comments in folio_add_anon_rmap_*() */ if (!folio_test_large(folio)) mlock_vma_folio(folio, vma); } @@ -1593,7 +1570,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, /* * It would be tidy to reset folio_test_anon mapping when fully - * unmapped, but that might overwrite a racing page_add_anon_rmap + * unmapped, but that might overwrite a racing folio_add_anon_rmap_*() * which increments mapcount after us but sets mapping before us: * so leave the reset to free_pages_prepare, and remember that * it's only reliable while mapped. -- cgit v1.2.3 From 0cae959e3abf19ba62805f6e6a8b42b6cd9ed3e3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:46 +0100 Subject: mm/rmap: remove RMAP_COMPOUND No longer used, let's remove it and clarify RMAP_NONE/RMAP_EXCLUSIVE a bit. Link: https://lkml.kernel.org/r/20231220224504.646757-23-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 12 +++--------- mm/rmap.c | 2 -- 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fe7b5a8b0e75..bf6cb79aa7a0 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -177,20 +177,14 @@ struct anon_vma *folio_get_anon_vma(struct folio *folio); typedef int __bitwise rmap_t; /* - * No special request: if the page is a subpage of a compound page, it is - * mapped via a PTE. The mapped (sub)page is possibly shared between processes. + * No special request: A mapped anonymous (sub)page is possibly shared between + * processes. 
*/ #define RMAP_NONE ((__force rmap_t)0) -/* The (sub)page is exclusive to a single process. */ +/* The anonymous (sub)page is exclusive to a single process. */ #define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) -/* - * The compound page is not mapped via PTEs, but instead via a single PMD and - * should be accounted accordingly. - */ -#define RMAP_COMPOUND ((__force rmap_t)BIT(1)) - /* * Internally, we're using an enum to specify the granularity. We make the * compiler emit specialized code for each granularity. diff --git a/mm/rmap.c b/mm/rmap.c index 87415bbf2402..2b386b9f6791 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2662,8 +2662,6 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) * The following two functions are for anonymous (private mapped) hugepages. * Unlike common anonymous pages, anonymous hugepages have no accounting code * and no lru code, because we handle hugepages differently from common pages. - * - * RMAP_COMPOUND is ignored. */ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) -- cgit v1.2.3 From b06dc281aa9901076898d4d0a7bde588f11bc204 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:47 +0100 Subject: mm/rmap: introduce folio_remove_rmap_[pte|ptes|pmd]() Let's mimic what we did with folio_add_file_rmap_*() and folio_add_anon_rmap_*() so we can similarly replace page_remove_rmap() next. Make the compiler always special-case on the granularity by using __always_inline. We're adding folio_remove_rmap_ptes() handling right away, as we want to use that soon for batching rmap operations when unmapping PTE-mapped large folios. 
Link: https://lkml.kernel.org/r/20231220224504.646757-24-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 6 ++++ mm/rmap.c | 82 ++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf6cb79aa7a0..57e045093f04 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -243,6 +243,12 @@ void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); +void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, + struct vm_area_struct *); +#define folio_remove_rmap_pte(folio, page, vma) \ + folio_remove_rmap_ptes(folio, page, 1, vma) +void folio_remove_rmap_pmd(struct folio *, struct page *, + struct vm_area_struct *); void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); diff --git a/mm/rmap.c b/mm/rmap.c index 2b386b9f6791..127318075395 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1510,25 +1510,37 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { struct folio *folio = page_folio(page); + + if (likely(!compound)) + folio_remove_rmap_pte(folio, page, vma); + else + folio_remove_rmap_pmd(folio, page, vma); +} + +static __always_inline void __folio_remove_rmap(struct folio *folio, + struct page *page, int nr_pages, struct vm_area_struct *vma, + enum rmap_level level) +{ atomic_t *mapped = &folio->_nr_pages_mapped; - int nr = 0, nr_pmdmapped = 0; - bool last; + int last, nr = 0, nr_pmdmapped = 0; enum node_stat_item idx; - VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); - VM_BUG_ON_PAGE(compound && !PageHead(page), page); - - /* Is page being unmapped by 
PTE? Is this its last map to be removed? */ - if (likely(!compound)) { - last = atomic_add_negative(-1, &page->_mapcount); - nr = last; - if (last && folio_test_large(folio)) { - nr = atomic_dec_return_relaxed(mapped); - nr = (nr < COMPOUND_MAPPED); - } - } else if (folio_test_pmd_mappable(folio)) { - /* That test is redundant: it's for safety or to optimize out */ + __folio_rmap_sanity_checks(folio, page, nr_pages, level); + + switch (level) { + case RMAP_LEVEL_PTE: + do { + last = atomic_add_negative(-1, &page->_mapcount); + if (last && folio_test_large(folio)) { + last = atomic_dec_return_relaxed(mapped); + last = (last < COMPOUND_MAPPED); + } + if (last) + nr++; + } while (page++, --nr_pages > 0); + break; + case RMAP_LEVEL_PMD: last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); @@ -1543,6 +1555,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, nr = 0; } } + break; } if (nr_pmdmapped) { @@ -1564,7 +1577,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, * is still mapped. */ if (folio_test_large(folio) && folio_test_anon(folio)) - if (!compound || nr < nr_pmdmapped) + if (level == RMAP_LEVEL_PTE || nr < nr_pmdmapped) deferred_split_folio(folio); } @@ -1579,6 +1592,43 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, munlock_vma_folio(folio, vma); } +/** + * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio + * @folio: The folio to remove the mappings from + * @page: The first page to remove + * @nr_pages: The number of pages that will be removed from the mapping + * @vma: The vm area from which the mappings are removed + * + * The page range of the folio is defined by [page, page + nr_pages) + * + * The caller needs to hold the page table lock. 
+ */ +void folio_remove_rmap_ptes(struct folio *folio, struct page *page, + int nr_pages, struct vm_area_struct *vma) +{ + __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); +} + +/** + * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio + * @folio: The folio to remove the mapping from + * @page: The first page to remove + * @vma: The vm area from which the mapping is removed + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock. + */ +void folio_remove_rmap_pmd(struct folio *folio, struct page *page, + struct vm_area_struct *vma) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); +#else + WARN_ON_ONCE(true); +#endif +} + /* * @arg: enum ttu_flags will be passed to this argument */ -- cgit v1.2.3 From 4d8f7418e8ba36036c8486d92d9591c368ab9b85 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:56 +0100 Subject: mm/rmap: remove page_remove_rmap() All callers are gone, let's remove it and some leftover traces. 
Link: https://lkml.kernel.org/r/20231220224504.646757-33-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 +--- mm/filemap.c | 10 +++++----- mm/internal.h | 2 +- mm/memory-failure.c | 4 ++-- mm/rmap.c | 23 ++--------------------- 5 files changed, 11 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 57e045093f04..fef369e37039 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -241,8 +241,6 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, folio_add_file_rmap_ptes(folio, page, 1, vma) void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); -void page_remove_rmap(struct page *, struct vm_area_struct *, - bool compound); void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_remove_rmap_pte(folio, page, vma) \ @@ -389,7 +387,7 @@ dup: * * This is similar to page_try_dup_anon_rmap(), however, not used during fork() * to duplicate a mapping, but instead to prepare for KSM or temporarily - * unmapping a page (swap, migration) via page_remove_rmap(). + * unmapping a page (swap, migration) via folio_remove_rmap_*(). * * Marking the page shared can only fail if the page may be pinned; device * private pages cannot get pinned and consequently this function cannot fail. 
diff --git a/mm/filemap.c b/mm/filemap.c index 67ba56ecdd32..c8dafe70d4cc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -113,11 +113,11 @@ * ->i_pages lock (try_to_unmap_one) * ->lruvec->lru_lock (follow_page->mark_page_accessed) * ->lruvec->lru_lock (check_pte_range->isolate_lru_page) - * ->private_lock (page_remove_rmap->set_page_dirty) - * ->i_pages lock (page_remove_rmap->set_page_dirty) - * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) - * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock) + * ->private_lock (folio_remove_rmap_pte->set_page_dirty) + * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) + * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) + * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) + * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) diff --git a/mm/internal.h b/mm/internal.h index 222e63b2dea4..a94355e70bd7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -651,7 +651,7 @@ folio_within_vma(struct folio *folio, struct vm_area_struct *vma) * under page table lock for the pte/pmd being added or removed. * * mlock is usually called at the end of page_add_*_rmap(), munlock at - * the end of page_remove_rmap(); but new anon folios are managed by + * the end of folio_remove_rmap_*(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). */ void mlock_folio(struct folio *folio); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5a23da5eb8c1..a0d9b4ac7d54 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2315,8 +2315,8 @@ try_again: * We use page flags to determine what action should be taken, but * the flags can be modified by the error containment action. 
One * example is an mlocked page, where PG_mlocked is cleared by - * page_remove_rmap() in try_to_unmap_one(). So to determine page status - * correctly, we save a copy of the page flags at this time. + * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page + * status correctly, we save a copy of the page flags at this time. */ page_flags = p->flags; diff --git a/mm/rmap.c b/mm/rmap.c index a3ec2be484cf..3ee254a99622 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -470,7 +470,7 @@ void __init anon_vma_init(void) /* * Getting a lock on a stable anon_vma from a page off the LRU is tricky! * - * Since there is no serialization what so ever against page_remove_rmap() + * Since there is no serialization what so ever against folio_remove_rmap_*() * the best this function can do is return a refcount increased anon_vma * that might have been relevant to this page. * @@ -487,7 +487,7 @@ void __init anon_vma_init(void) * [ something equivalent to page_mapped_in_vma() ]. * * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from - * page_remove_rmap() that the anon_vma pointer from page->mapping is valid + * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid * if there is a mapcount, we can dereference the anon_vma after observing * those. * @@ -1498,25 +1498,6 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, #endif } -/** - * page_remove_rmap - take down pte mapping from a page - * @page: page to remove mapping from - * @vma: the vm area from which the mapping is removed - * @compound: uncharge the page as compound or small page - * - * The caller needs to hold the pte lock. 
- */ -void page_remove_rmap(struct page *page, struct vm_area_struct *vma, - bool compound) -{ - struct folio *folio = page_folio(page); - - if (likely(!compound)) - folio_remove_rmap_pte(folio, page, vma); - else - folio_remove_rmap_pmd(folio, page, vma); -} - static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum rmap_level level) -- cgit v1.2.3 From d8ef5e311d7bfde54b60ab45026f206eff31b2d2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:57 +0100 Subject: mm/rmap: convert page_dup_file_rmap() to folio_dup_file_rmap_[pte|ptes|pmd]() Let's convert page_dup_file_rmap() like the other rmap functions. As there is only a single caller, convert that single caller right away and remove page_dup_file_rmap(). Add folio_dup_file_rmap_ptes() right away, we want to perform rmap batching during fork() soon. Link: https://lkml.kernel.org/r/20231220224504.646757-34-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++----- mm/memory.c | 2 +- 2 files changed, 55 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fef369e37039..7607f862e795 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -308,6 +308,60 @@ static inline void hugetlb_remove_rmap(struct folio *folio) atomic_dec(&folio->_entire_mapcount); } +static __always_inline void __folio_dup_file_rmap(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) +{ + __folio_rmap_sanity_checks(folio, page, nr_pages, level); + + switch (level) { + case RMAP_LEVEL_PTE: + do { + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + break; + case RMAP_LEVEL_PMD: +
atomic_inc(&folio->_entire_mapcount); + break; + } +} + +/** + * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio + * @folio: The folio to duplicate the mappings of + * @page: The first page to duplicate the mappings of + * @nr_pages: The number of pages of which the mapping will be duplicated + * + * The page range of the folio is defined by [page, page + nr_pages) + * + * The caller needs to hold the page table lock. + */ +static inline void folio_dup_file_rmap_ptes(struct folio *folio, + struct page *page, int nr_pages) +{ + __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE); +} +#define folio_dup_file_rmap_pte(folio, page) \ + folio_dup_file_rmap_ptes(folio, page, 1) + +/** + * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio + * @folio: The folio to duplicate the mapping of + * @page: The first page to duplicate the mapping of + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock. 
+ */ +static inline void folio_dup_file_rmap_pmd(struct folio *folio, + struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE); +#else + WARN_ON_ONCE(true); +#endif +} + static inline void __page_dup_rmap(struct page *page, bool compound) { VM_WARN_ON(folio_test_hugetlb(page_folio(page))); @@ -322,11 +376,6 @@ static inline void __page_dup_rmap(struct page *page, bool compound) } } -static inline void page_dup_file_rmap(struct page *page, bool compound) -{ - __page_dup_rmap(page, compound); -} - /** * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped * anonymous page diff --git a/mm/memory.c b/mm/memory.c index e52c6e97444a..fdc87bf15545 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -965,7 +965,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, rss[MM_ANONPAGES]++; } else if (page) { folio_get(folio); - page_dup_file_rmap(page, false); + folio_dup_file_rmap_pte(folio, page); rss[mm_counter_file(page)]++; } -- cgit v1.2.3 From 61d90309b7156d54c5d358cb5d8bf55b33d233d2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:58 +0100 Subject: mm/rmap: introduce folio_try_dup_anon_rmap_[pte|ptes|pmd]() The last users of page_needs_cow_for_dma() and __page_dup_rmap() are gone, remove them. Add folio_try_dup_anon_rmap_ptes() right away, we want to perform rmap batching during fork() soon.
Link: https://lkml.kernel.org/r/20231220224504.646757-35-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 --- include/linux/rmap.h | 150 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 106 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 75bba6102825..896c0079f64f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1975,12 +1975,6 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, return folio_maybe_dma_pinned(folio); } -static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, - struct page *page) -{ - return folio_needs_cow_for_dma(vma, page_folio(page)); -} - /** * is_zero_page - Query if a page is a zero page * @page: The page to query diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7607f862e795..850aa74b6724 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -362,68 +362,130 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio, #endif } -static inline void __page_dup_rmap(struct page *page, bool compound) +static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, + struct page *page, int nr_pages, struct vm_area_struct *src_vma, + enum rmap_level level) { - VM_WARN_ON(folio_test_hugetlb(page_folio(page))); + bool maybe_pinned; + int i; + + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + __folio_rmap_sanity_checks(folio, page, nr_pages, level); - if (compound) { - struct folio *folio = (struct folio *)page; + /* + * If this folio may have been pinned by the parent process, + * don't allow to duplicate the mappings but instead require to e.g., + * copy the subpage immediately for the child so that we'll always + * guarantee the pinned folio won't be randomly replaced in the + * future on write 
faults. + */ + maybe_pinned = likely(!folio_is_device_private(folio)) && + unlikely(folio_needs_cow_for_dma(src_vma, folio)); - VM_BUG_ON_PAGE(compound && !PageHead(page), page); + /* + * No need to check+clear for already shared PTEs/PMDs of the + * folio. But if any page is PageAnonExclusive, we must fallback to + * copying if the folio maybe pinned. + */ + switch (level) { + case RMAP_LEVEL_PTE: + if (unlikely(maybe_pinned)) { + for (i = 0; i < nr_pages; i++) + if (PageAnonExclusive(page + i)) + return -EBUSY; + } + do { + if (PageAnonExclusive(page)) + ClearPageAnonExclusive(page); + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + break; + case RMAP_LEVEL_PMD: + if (PageAnonExclusive(page)) { + if (unlikely(maybe_pinned)) + return -EBUSY; + ClearPageAnonExclusive(page); + } atomic_inc(&folio->_entire_mapcount); - } else { - atomic_inc(&page->_mapcount); + break; } + return 0; } /** - * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped - * anonymous page - * @page: the page to duplicate the mapping for - * @compound: the page is mapped as compound or as a small page - * @vma: the source vma + * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range + * of a folio + * @folio: The folio to duplicate the mappings of + * @page: The first page to duplicate the mappings of + * @nr_pages: The number of pages of which the mapping will be duplicated + * @src_vma: The vm area from which the mappings are duplicated * - * The caller needs to hold the PT lock and the vma->vma_mm->write_protect_seq. + * The page range of the folio is defined by [page, page + nr_pages) * - * Duplicating the mapping can only fail if the page may be pinned; device - * private pages cannot get pinned and consequently this function cannot fail. + * The caller needs to hold the page table lock and the + * vma->vma_mm->write_protect_seq. 
+ * + * Duplicating the mappings can only fail if the folio may be pinned; device + * private folios cannot get pinned and consequently this function cannot fail + * for them. + * + * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in + * the parent and the child. They must *not* be writable after this call + * succeeded. + * + * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise. + */ +static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, + struct page *page, int nr_pages, struct vm_area_struct *src_vma) +{ + return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma, + RMAP_LEVEL_PTE); +} +#define folio_try_dup_anon_rmap_pte(folio, page, vma) \ + folio_try_dup_anon_rmap_ptes(folio, page, 1, vma) + +/** + * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range + * of a folio + * @folio: The folio to duplicate the mapping of + * @page: The first page to duplicate the mapping of + * @src_vma: The vm area from which the mapping is duplicated + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * - * If duplicating the mapping succeeds, the page has to be mapped R/O into - * the parent and the child. It must *not* get mapped writable after this call. + * The caller needs to hold the page table lock and the + * vma->vma_mm->write_protect_seq. + * + * Duplicating the mapping can only fail if the folio may be pinned; device + * private folios cannot get pinned and consequently this function cannot fail + * for them. + * + * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in + * the parent and the child. They must *not* be writable after this call + * succeeded. * * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. 
*/ +static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, + struct page *page, struct vm_area_struct *src_vma) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma, + RMAP_LEVEL_PMD); +#else + WARN_ON_ONCE(true); + return -EBUSY; +#endif +} + static inline int page_try_dup_anon_rmap(struct page *page, bool compound, struct vm_area_struct *vma) { - VM_BUG_ON_PAGE(!PageAnon(page), page); - - /* - * No need to check+clear for already shared pages, including KSM - * pages. - */ - if (!PageAnonExclusive(page)) - goto dup; - - /* - * If this page may have been pinned by the parent process, - * don't allow to duplicate the mapping but instead require to e.g., - * copy the page immediately for the child so that we'll always - * guarantee the pinned page won't be randomly replaced in the - * future on write faults. - */ - if (likely(!is_device_private_page(page)) && - unlikely(page_needs_cow_for_dma(vma, page))) - return -EBUSY; + struct folio *folio = page_folio(page); - ClearPageAnonExclusive(page); - /* - * It's okay to share the anon page between both processes, mapping - * the page R/O into both processes. - */ -dup: - __page_dup_rmap(page, compound); - return 0; + if (likely(!compound)) + return folio_try_dup_anon_rmap_pte(folio, page, vma); + return folio_try_dup_anon_rmap_pmd(folio, page, vma); } /** -- cgit v1.2.3 From a13d096471ec0ac5c6fc90fbcd57e8430024046a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:45:01 +0100 Subject: mm/rmap: remove page_try_dup_anon_rmap() All users are gone, remove page_try_dup_anon_rmap() and any remaining traces. 
Link: https://lkml.kernel.org/r/20231220224504.646757-38-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 850aa74b6724..0ad2ea2734e4 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -253,7 +253,7 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); -/* See page_try_dup_anon_rmap() */ +/* See folio_try_dup_anon_rmap_*() */ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { @@ -478,16 +478,6 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, #endif } -static inline int page_try_dup_anon_rmap(struct page *page, bool compound, - struct vm_area_struct *vma) -{ - struct folio *folio = page_folio(page); - - if (likely(!compound)) - return folio_try_dup_anon_rmap_pte(folio, page, vma); - return folio_try_dup_anon_rmap_pmd(folio, page, vma); -} - /** * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly * shared to prepare for KSM or temporary unmapping @@ -496,8 +486,8 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * The caller needs to hold the PT lock and has to have the page table entry * cleared/invalidated. * - * This is similar to page_try_dup_anon_rmap(), however, not used during fork() - * to duplicate a mapping, but instead to prepare for KSM or temporarily + * This is similar to folio_try_dup_anon_rmap_*(), however, not used during + * fork() to duplicate a mapping, but instead to prepare for KSM or temporarily * unmapping a page (swap, migration) via folio_remove_rmap_*(). 
* * Marking the page shared can only fail if the page may be pinned; device -- cgit v1.2.3 From e3b4b1374f87c71e9309efc6149f113cdd17af72 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:45:02 +0100 Subject: mm: convert page_try_share_anon_rmap() to folio_try_share_anon_rmap_[pte|pmd]() Let's convert it like we converted all the other rmap functions. Don't introduce folio_try_share_anon_rmap_ptes() for now, as we don't have a user that wants rmap batching in sight. Pretty easy to add later. All users are easy to convert -- only ksm.c doesn't use folios yet but that is left for future work -- so let's just do it in a single shot. While at it, turn the BUG_ON into a WARN_ON_ONCE. Note that page_try_share_anon_rmap() so far didn't care about pte/pmd mappings (no compound parameter). We're changing that so we can perform better sanity checks and make the code actually more readable/consistent. For example, __folio_rmap_sanity_checks() will make sure that a PMD range actually falls completely into the folio. 
Link: https://lkml.kernel.org/r/20231220224504.646757-39-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 96 ++++++++++++++++++++++++++++++++++++++-------------- mm/gup.c | 2 +- mm/huge_memory.c | 9 ++--- mm/internal.h | 4 +-- mm/ksm.c | 5 +-- mm/migrate_device.c | 2 +- mm/rmap.c | 11 +++--- 7 files changed, 89 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0ad2ea2734e4..fd6fe16fa358 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -269,7 +269,7 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, return 0; } -/* See page_try_share_anon_rmap() */ +/* See folio_try_share_anon_rmap_*() */ static inline int hugetlb_try_share_anon_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); @@ -478,31 +478,15 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, #endif } -/** - * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly - * shared to prepare for KSM or temporary unmapping - * @page: the exclusive anonymous page to try marking possibly shared - * - * The caller needs to hold the PT lock and has to have the page table entry - * cleared/invalidated. - * - * This is similar to folio_try_dup_anon_rmap_*(), however, not used during - * fork() to duplicate a mapping, but instead to prepare for KSM or temporarily - * unmapping a page (swap, migration) via folio_remove_rmap_*(). - * - * Marking the page shared can only fail if the page may be pinned; device - * private pages cannot get pinned and consequently this function cannot fail. - * - * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY - * otherwise. 
- */ -static inline int page_try_share_anon_rmap(struct page *page) +static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) { - VM_WARN_ON(folio_test_hugetlb(page_folio(page))); - VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio); + __folio_rmap_sanity_checks(folio, page, nr_pages, level); - /* device private pages cannot get pinned via GUP. */ - if (unlikely(is_device_private_page(page))) { + /* device private folios cannot get pinned via GUP. */ + if (unlikely(folio_is_device_private(folio))) { ClearPageAnonExclusive(page); return 0; } @@ -553,7 +537,7 @@ static inline int page_try_share_anon_rmap(struct page *page) if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) smp_mb(); - if (unlikely(page_maybe_dma_pinned(page))) + if (unlikely(folio_maybe_dma_pinned(folio))) return -EBUSY; ClearPageAnonExclusive(page); @@ -566,6 +550,68 @@ static inline int page_try_share_anon_rmap(struct page *page) return 0; } +/** + * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page + * mapped by a PTE possibly shared to prepare + * for KSM or temporary unmapping + * @folio: The folio to share a mapping of + * @page: The mapped exclusive page + * + * The caller needs to hold the page table lock and has to have the page table + * entries cleared/invalidated. + * + * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during + * fork() to duplicate mappings, but instead to prepare for KSM or temporarily + * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte(). + * + * Marking the mapped page shared can only fail if the folio maybe pinned; + * device private folios cannot get pinned and consequently this function cannot + * fail. + * + * Returns 0 if marking the mapped page possibly shared succeeded. Returns + * -EBUSY otherwise. 
+ */ +static inline int folio_try_share_anon_rmap_pte(struct folio *folio, + struct page *page) +{ + return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE); +} + +/** + * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page + * range mapped by a PMD possibly shared to + * prepare for temporary unmapping + * @folio: The folio to share the mapping of + * @page: The first page to share the mapping of + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock and has to have the page table + * entries cleared/invalidated. + * + * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during + * fork() to duplicate a mapping, but instead to prepare for temporarily + * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd(). + * + * Marking the mapped pages shared can only fail if the folio maybe pinned; + * device private folios cannot get pinned and consequently this function cannot + * fail. + * + * Returns 0 if marking the mapped pages possibly shared succeeded. Returns + * -EBUSY otherwise. + */ +static inline int folio_try_share_anon_rmap_pmd(struct folio *folio, + struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR, + RMAP_LEVEL_PMD); +#else + WARN_ON_ONCE(true); + return -EBUSY; +#endif +} + /* * Called from mm/vmscan.c to handle paging out */ diff --git a/mm/gup.c b/mm/gup.c index 0a5f0e91bfec..df83182ec72d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -177,7 +177,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) /* * Adjust the pincount before re-checking the PTE for changes. * This is essentially a smp_mb() and is paired with a memory - * barrier in page_try_share_anon_rmap(). + * barrier in folio_try_share_anon_rmap_*(). 
*/ smp_mb__after_atomic(); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de623b942b6e..1a588e29d287 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2523,10 +2523,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * In case we cannot clear PageAnonExclusive(), split the PMD * only and let try_to_migrate_one() fail later. * - * See page_try_share_anon_rmap(): invalidate PMD first. + * See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */ anon_exclusive = PageAnonExclusive(page); - if (freeze && anon_exclusive && page_try_share_anon_rmap(page)) + if (freeze && anon_exclusive && + folio_try_share_anon_rmap_pmd(folio, page)) freeze = false; if (!freeze) { rmap_t rmap_flags = RMAP_NONE; @@ -3554,9 +3555,9 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = pmdp_invalidate(vma, address, pvmw->pmd); - /* See page_try_share_anon_rmap(): invalidate PMD first. */ + /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. 
*/ anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); - if (anon_exclusive && page_try_share_anon_rmap(page)) { + if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) { set_pmd_at(mm, address, pvmw->pmd, pmdval); return -EBUSY; } diff --git a/mm/internal.h b/mm/internal.h index a94355e70bd7..29589bc3f046 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1047,7 +1047,7 @@ enum { * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq * * GUP-fast and KSM or temporary unmapping (swap, migration): see - * page_try_share_anon_rmap() + * folio_try_share_anon_rmap_*() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a @@ -1090,7 +1090,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, return is_cow_mapping(vma->vm_flags); } - /* Paired with a memory barrier in page_try_share_anon_rmap(). */ + /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) smp_rmb(); diff --git a/mm/ksm.c b/mm/ksm.c index 716e2f87dd79..8c001819cf10 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1331,8 +1331,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, goto out_unlock; } - /* See page_try_share_anon_rmap(): clear PTE first. */ - if (anon_exclusive && page_try_share_anon_rmap(page)) { + /* See folio_try_share_anon_rmap_pte(): clear PTE first. 
*/ + if (anon_exclusive && + folio_try_share_anon_rmap_pte(page_folio(page), page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 39b7754480c6..b6c27c76e1a0 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -202,7 +202,7 @@ again: if (anon_exclusive) { pte = ptep_clear_flush(vma, addr, ptep); - if (page_try_share_anon_rmap(page)) { + if (folio_try_share_anon_rmap_pte(folio, page)) { set_pte_at(mm, addr, ptep, pte); folio_unlock(folio); folio_put(folio); diff --git a/mm/rmap.c b/mm/rmap.c index 3ee254a99622..6209e65985a2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1866,9 +1866,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, break; } - /* See page_try_share_anon_rmap(): clear PTE first. */ + /* See folio_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && - page_try_share_anon_rmap(subpage)) { + folio_try_share_anon_rmap_pte(folio, subpage)) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); ret = false; @@ -2142,7 +2142,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, pte_t swp_pte; if (anon_exclusive) - BUG_ON(page_try_share_anon_rmap(subpage)); + WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio, + subpage)); /* * Store the pfn of the page in a special migration @@ -2213,7 +2214,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) && !anon_exclusive, subpage); - /* See page_try_share_anon_rmap(): clear PTE first. */ + /* See folio_try_share_anon_rmap_pte(): clear PTE first. 
*/ if (folio_test_hugetlb(folio)) { if (anon_exclusive && hugetlb_try_share_anon_rmap(folio)) { @@ -2224,7 +2225,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, break; } } else if (anon_exclusive && - page_try_share_anon_rmap(subpage)) { + folio_try_share_anon_rmap_pte(folio, subpage)) { set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); -- cgit v1.2.3 From 501a06fe8e4c185bbda371b8cedbdf1b23a633d8 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 7 Dec 2023 11:24:06 -0800 Subject: zswap: memcontrol: implement zswap writeback disabling During our experiment with zswap, we sometimes observe swap IOs due to occasional zswap store failures and writebacks-to-swap. These swapping IOs prevent many users who cannot tolerate swapping from adopting zswap to save memory and improve performance where possible. This patch adds the option to disable this behavior entirely: do not write back to the backing swap device when a zswap store attempt fails, and do not write pages in the zswap pool back to the backing swap device (both when the pool is full, and when the new zswap shrinker is called). This new behavior can be opted-in/out on a per-cgroup basis via a new cgroup file. By default, writeback to the swap device is enabled, which is the previous behavior. Initially, writeback is enabled for the root cgroup, and a newly created cgroup will inherit the current setting of its parent. Note that this is subtly different from setting memory.swap.max to 0, as it still allows for pages to be stored in the zswap pool (which itself consumes swap space in its current form). This patch should be applied on top of the zswap shrinker series: https://lore.kernel.org/linux-mm/20231130194023.4102148-1-nphamcs@gmail.com/ as it also disables the zswap shrinker, a major source of zswap writebacks.
For the most part, this feature is motivated by internal parties who have already established their opinions regarding swapping - the workloads that are highly sensitive to IO, and especially those who are using servers with really slow disk performance (for instance, massive but slow HDDs). For these folks, it's impossible to convince them to even entertain zswap if swapping also comes as a packaged deal. Writeback disabling is quite a useful feature in these situations - on a mixed workloads deployment, they can disable writeback for the more IO-sensitive workloads, and enable writeback for other background workloads. For instance, on a server with HDD, I allocate memories and populate them with random values (so that zswap store will always fail), and specify memory.high low enough to trigger reclaim. The time it takes to allocate the memories and just read through it a couple of times (doing silly things like computing the values' average etc.): zswap.writeback disabled: real 0m30.537s user 0m23.687s sys 0m6.637s 0 pages swapped in 0 pages swapped out zswap.writeback enabled: real 0m45.061s user 0m24.310s sys 0m8.892s 712686 pages swapped in 461093 pages swapped out (the last two lines are from vmstat -s). 
[nphamcs@gmail.com: add a comment about recurring zswap store failures leading to reclaim inefficiency] Link: https://lkml.kernel.org/r/20231221005725.3446672-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231207192406.3809579-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Reviewed-by: Yosry Ahmed Acked-by: Chris Li Cc: Dan Streetman Cc: David Heidelberg Cc: Domenico Cerasuolo Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Shakeel Butt Cc: Tejun Heo Cc: Vitaly Wool Cc: Zefan Li Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 15 +++++++++++++ Documentation/admin-guide/mm/zswap.rst | 10 +++++++++ include/linux/memcontrol.h | 12 +++++++++++ include/linux/zswap.h | 7 ++++++ mm/memcontrol.c | 38 +++++++++++++++++++++++++++++++++ mm/page_io.c | 5 +++++ mm/shmem.c | 3 +-- mm/zswap.c | 13 +++++++++-- 8 files changed, 99 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 3f85254f3cef..5ec7dd753cd1 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1679,6 +1679,21 @@ PAGE_SIZE multiple when read back. limit, it will refuse to take any more stores before existing entries fault back in or are written out to disk. + memory.zswap.writeback + A read-write single value file. The default value is "1". The + initial value of the root cgroup is 1, and when a new cgroup is + created, it inherits the current value of its parent. + + When this is set to 0, all swapping attempts to swapping devices + are disabled. This included both zswap writebacks, and swapping due + to zswap store failures. 
If the zswap store failures are recurring + (for e.g if the pages are incompressible), users can observe + reclaim inefficiency after disabling writeback (because the same + pages might be rejected again and again). + + Note that this is subtly different from setting memory.swap.max to + 0, as it still allows for pages to be written to the zswap pool. + memory.pressure A read-only nested-keyed file. diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 62fc244ec702..b42132969e31 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -153,6 +153,16 @@ attribute, e. g.:: Setting this parameter to 100 will disable the hysteresis. +Some users cannot tolerate the swapping that comes with zswap store failures +and zswap writebacks. Swapping can be disabled entirely (without disabling +zswap itself) on a cgroup-basis as follows: + + echo 0 > /sys/fs/cgroup//memory.zswap.writeback + +Note that if the store failures are recurring (for e.g if the pages are +incompressible), users can observe reclaim inefficiency after disabling +writeback (because the same pages might be rejected again and again). + When there is a sizable amount of cold memory residing in the zswap pool, it can be advantageous to proactively write these cold pages to swap and reclaim the memory for other use cases. By default, the zswap shrinker is disabled. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 43b77363ab8e..5de775e6cdd9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -219,6 +219,12 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) unsigned long zswap_max; + + /* + * Prevent pages from this memcg from being written back from zswap to + * swap, and from being swapped out on zswap store failures. 
+ */ + bool zswap_writeback; #endif unsigned long soft_limit; @@ -1941,6 +1947,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); +bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg); #else static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { @@ -1954,6 +1961,11 @@ static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { } +static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) +{ + /* if zswap is disabled, do not block pages going to the swapping device */ + return true; +} #endif #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e88572d4c720..0b709f5bc65f 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -35,6 +35,7 @@ void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); +bool is_zswap_enabled(void); #else struct zswap_lruvec_state {}; @@ -55,6 +56,12 @@ static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} static inline void zswap_folio_swapin(struct folio *folio) {} + +static inline bool is_zswap_enabled(void) +{ + return false; +} + #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b08b9cd4a3a8..3ca691fb5b49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5538,6 +5538,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) memcg->zswap_max = PAGE_COUNTER_MAX; + 
WRITE_ONCE(memcg->zswap_writeback, + !parent || READ_ONCE(parent->zswap_writeback)); #endif page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { @@ -8166,6 +8168,12 @@ void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) rcu_read_unlock(); } +bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) +{ + /* if zswap is disabled, do not block pages going to the swapping device */ + return !is_zswap_enabled() || !memcg || READ_ONCE(memcg->zswap_writeback); +} + static u64 zswap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -8198,6 +8206,31 @@ static ssize_t zswap_max_write(struct kernfs_open_file *of, return nbytes; } +static int zswap_writeback_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + seq_printf(m, "%d\n", READ_ONCE(memcg->zswap_writeback)); + return 0; +} + +static ssize_t zswap_writeback_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int zswap_writeback; + ssize_t parse_ret = kstrtoint(strstrip(buf), 0, &zswap_writeback); + + if (parse_ret) + return parse_ret; + + if (zswap_writeback != 0 && zswap_writeback != 1) + return -EINVAL; + + WRITE_ONCE(memcg->zswap_writeback, zswap_writeback); + return nbytes; +} + static struct cftype zswap_files[] = { { .name = "zswap.current", @@ -8210,6 +8243,11 @@ static struct cftype zswap_files[] = { .seq_show = zswap_max_show, .write = zswap_max_write, }, + { + .name = "zswap.writeback", + .seq_show = zswap_writeback_show, + .write = zswap_writeback_write, + }, { } /* terminate */ }; #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ diff --git a/mm/page_io.c b/mm/page_io.c index 09c6a4f316f3..ae2b49055e43 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -201,6 +201,11 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) folio_end_writeback(folio); return 0; } + if 
(!mem_cgroup_zswap_writeback_enabled(folio_memcg(folio))) { + folio_mark_dirty(folio); + return AOP_WRITEPAGE_ACTIVATE; + } + __swap_writepage(folio, wbc); return 0; } diff --git a/mm/shmem.c b/mm/shmem.c index a4d388973021..928aa2304932 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1514,8 +1514,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - swap_writepage(&folio->page, wbc); - return 0; + return swap_writepage(&folio->page, wbc); } mutex_unlock(&shmem_swaplist_mutex); diff --git a/mm/zswap.c b/mm/zswap.c index f760801a3ea8..ca25b676048e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -153,6 +153,11 @@ static bool zswap_shrinker_enabled = IS_ENABLED( CONFIG_ZSWAP_SHRINKER_DEFAULT_ON); module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); +bool is_zswap_enabled(void) +{ + return zswap_enabled; +} + /********************************* * data structures **********************************/ @@ -596,7 +601,8 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct zswap_pool *pool = shrinker->private_data; bool encountered_page_in_swapcache = false; - if (!zswap_shrinker_enabled) { + if (!zswap_shrinker_enabled || + !mem_cgroup_zswap_writeback_enabled(sc->memcg)) { sc->nr_scanned = 0; return SHRINK_STOP; } @@ -637,7 +643,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; - if (!zswap_shrinker_enabled) + if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; #ifdef CONFIG_MEMCG_KMEM @@ -923,6 +929,9 @@ static int shrink_memcg(struct mem_cgroup *memcg) struct zswap_pool *pool; int nid, shrunk = 0; + if (!mem_cgroup_zswap_writeback_enabled(memcg)) + return -EINVAL; + /* * Skip zombies because their LRUs are reparented and we would be * reclaiming from the 
parent instead of the dead memcg. -- cgit v1.2.3 From 9c5938694cd0e9e00bdfb7e60900673263daf4d5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Jan 2024 16:57:29 +0100 Subject: mm/rmap: silence VM_WARN_ON_FOLIO() in __folio_rmap_sanity_checks() Unfortunately, vm_insert_page() and friends end up passing driver-allocated folios into folio_add_file_rmap_pte() using insert_page_into_pte_locked(). While these driver-allocated folios can be compound pages (large folios), they are not proper "rmappable" folios. In these VM_MIXEDMAP VMAs, there isn't really the concept of a reverse mapping, so long-term, we should clean that up and not call into rmap code. For the time being, document how we can end up in rmap code with large folios that are not marked rmappable. Link: https://lkml.kernel.org/r/793c5cee-d5fc-4eb1-86a2-39e05686233d@redhat.com Fixes: 68f0320824fa ("mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]()") Reported-by: syzbot+50ef73537bbc393a25bb@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/000000000000014174060e09316e@google.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fd6fe16fa358..b7944a833668 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -199,8 +199,15 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio, { /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); - VM_WARN_ON_FOLIO(folio_test_large(folio) && - !folio_test_large_rmappable(folio), folio); + + /* + * TODO: we get driver-allocated folios that have nothing to do with + * the rmap using vm_insert_page(); therefore, we cannot assume that + * folio_test_large_rmappable() holds for large folios.
We should + * handle any desired mapcount+stats accounting for these folios in + * VM_MIXEDMAP VMAs separately, and then sanity-check here that + * we really only get rmappable folios. + */ VM_WARN_ON_ONCE(nr_pages <= 0); VM_WARN_ON_FOLIO(page_folio(page) != folio, folio); -- cgit v1.2.3 From 71ce1ab54a505736786d9c5921e6c2718c7ec535 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 27 Dec 2023 14:12:01 +0000 Subject: mm/mglru: add CONFIG_ARCH_HAS_HW_PTE_YOUNG Patch series "mm/mglru: Kconfig cleanup", v4. This series is the result of the following discussion: https://lore.kernel.org/47066176-bd93-55dd-c2fa-002299d9e034@linux.ibm.com/ It mainly avoids building the code that walks page tables on CPUs that cannot use it, i.e., those that don't support the hardware accessed bit. Specifically, it introduces a new Kconfig to guard some of the functions added by commit bd74fdaea146 ("mm: multi-gen LRU: support page table walks") on CPUs like POWER9, on which the series was tested. This patch (of 5): Some architectures are able to set the accessed bit in PTEs when PTEs are used as part of linear address translations. Add CONFIG_ARCH_HAS_HW_PTE_YOUNG for such architectures to be able to override arch_has_hw_pte_young().
Link: https://lkml.kernel.org/r/20231227141205.2200125-1-kinseyho@google.com Link: https://lkml.kernel.org/r/20231227141205.2200125-2-kinseyho@google.com Signed-off-by: Kinsey Ho Co-developed-by: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Tested-by: Donet Tom Acked-by: Yu Zhao Cc: kernel test robot Signed-off-by: Andrew Morton --- arch/Kconfig | 8 ++++++++ arch/arm64/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable.h | 6 ------ include/linux/pgtable.h | 2 +- 5 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index f4b210ab0612..8c8901f80586 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1470,6 +1470,14 @@ config DYNAMIC_SIGFRAME config HAVE_ARCH_NODE_DEV_GROUP bool +config ARCH_HAS_HW_PTE_YOUNG + bool + help + Architectures that select this option are capable of setting the + accessed bit in PTE entries when using them as part of linear address + translations. Architectures that require runtime check should select + this option and override arch_has_hw_pte_young(). 
+ config ARCH_HAS_NONLEAF_PMD_YOUNG bool help diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7b071a00425d..12d611f3da5d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -36,6 +36,7 @@ config ARM64 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_HW_PTE_YOUNG select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1566748f16c4..04941a1ffc0a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_HW_PTE_YOUNG select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_COPY_MC if X86_64 diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 57bab91bbf50..08b5cb22d9a6 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1679,12 +1679,6 @@ static inline bool arch_has_pfn_modify_check(void) return boot_cpu_has_bug(X86_BUG_L1TF); } -#define arch_has_hw_pte_young arch_has_hw_pte_young -static inline bool arch_has_hw_pte_young(void) -{ - return true; -} - #define arch_check_zapped_pte arch_check_zapped_pte void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index af7639c3b0a3..9ecc20fa6269 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -375,7 +375,7 @@ static inline bool arch_has_hw_nonleaf_pmd_young(void) */ static inline bool arch_has_hw_pte_young(void) { - return false; + return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif -- cgit v1.2.3 From 61dd3f246b3adaabff3241c586f2210ac91b05a4 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 27 Dec 2023 14:12:02 +0000 Subject: mm/mglru: add CONFIG_LRU_GEN_WALKS_MMU Add 
CONFIG_LRU_GEN_WALKS_MMU such that if disabled, the code that walks page tables to promote pages into the youngest generation will not be built. Also improves code readability by adding two helper functions get_mm_state() and get_next_mm(). Link: https://lkml.kernel.org/r/20231227141205.2200125-3-kinseyho@google.com Signed-off-by: Kinsey Ho Co-developed-by: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Tested-by: Donet Tom Acked-by: Yu Zhao Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- include/linux/mm_types.h | 12 ++- include/linux/mmzone.h | 2 + kernel/fork.c | 2 +- mm/Kconfig | 4 + mm/vmscan.c | 192 +++++++++++++++++++++++++++++---------------- 6 files changed, 139 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5de775e6cdd9..20ff87f8e001 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -330,7 +330,7 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif -#ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a66534c78c4d..552fa2d11c57 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -958,7 +958,7 @@ struct mm_struct { */ unsigned long ksm_zero_pages; #endif /* CONFIG_KSM */ -#ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_WALKS_MMU struct { /* this mm_struct is on lru_gen_mm_list */ struct list_head list; @@ -973,7 +973,7 @@ struct mm_struct { struct mem_cgroup *memcg; #endif } lru_gen; -#endif /* CONFIG_LRU_GEN */ +#endif /* CONFIG_LRU_GEN_WALKS_MMU */ } __randomize_layout; /* @@ -1011,6 +1011,10 @@ struct lru_gen_mm_list { spinlock_t lock; }; +#endif /* CONFIG_LRU_GEN */ + +#ifdef CONFIG_LRU_GEN_WALKS_MMU + void lru_gen_add_mm(struct mm_struct *mm); void lru_gen_del_mm(struct mm_struct *mm); #ifdef CONFIG_MEMCG @@ -1036,7 +1040,7 
@@ static inline void lru_gen_use_mm(struct mm_struct *mm) WRITE_ONCE(mm->lru_gen.bitmap, -1); } -#else /* !CONFIG_LRU_GEN */ +#else /* !CONFIG_LRU_GEN_WALKS_MMU */ static inline void lru_gen_add_mm(struct mm_struct *mm) { @@ -1060,7 +1064,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm) { } -#endif /* CONFIG_LRU_GEN */ +#endif /* CONFIG_LRU_GEN_WALKS_MMU */ struct vma_iterator { struct ma_state mas; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2efd3be484fd..bc3f63ec4291 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -640,9 +640,11 @@ struct lruvec { #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; +#ifdef CONFIG_LRU_GEN_WALKS_MMU /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif +#endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif diff --git a/kernel/fork.c b/kernel/fork.c index 93924392a5c3..56cf276432c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2946,7 +2946,7 @@ pid_t kernel_clone(struct kernel_clone_args *args) get_task_struct(p); } - if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { + if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) { /* lock the task to synchronize with memcg migration */ task_lock(p); lru_gen_add_mm(p->mm); diff --git a/mm/Kconfig b/mm/Kconfig index b072664b889a..79d563d8f9e0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1274,6 +1274,10 @@ config LRU_GEN_STATS from evicted generations for debugging purpose. This option has a per-memcg and per-node memory overhead. 
+ +config LRU_GEN_WALKS_MMU + def_bool y + depends on LRU_GEN && ARCH_HAS_HW_PTE_YOUNG # } config ARCH_SUPPORTS_PER_VMA_LOCK diff --git a/mm/vmscan.c b/mm/vmscan.c index b4ca3563bcf4..aa7ea09ffb4c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2671,13 +2671,14 @@ static void get_item_key(void *item, int *key) key[1] = hash >> BLOOM_FILTER_SHIFT; } -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, + void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = READ_ONCE(lruvec->mm_state.filters[gen]); + filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return true; @@ -2686,13 +2687,14 @@ static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *it return test_bit(key[0], filter) && test_bit(key[1], filter); } -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, + void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = READ_ONCE(lruvec->mm_state.filters[gen]); + filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return; @@ -2704,12 +2706,12 @@ static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void * set_bit(key[1], filter); } -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq) { unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = lruvec->mm_state.filters[gen]; + filter = mm_state->filters[gen]; if (filter) { bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); return; @@ -2717,13 +2719,15 @@ static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - 
WRITE_ONCE(lruvec->mm_state.filters[gen], filter); + WRITE_ONCE(mm_state->filters[gen], filter); } /****************************************************************************** * mm_struct list ******************************************************************************/ +#ifdef CONFIG_LRU_GEN_WALKS_MMU + static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { static struct lru_gen_mm_list mm_list = { @@ -2740,6 +2744,29 @@ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) return &mm_list; } +static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) +{ + return &lruvec->mm_state; +} + +static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) +{ + int key; + struct mm_struct *mm; + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); + + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); + + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) + return NULL; + + clear_bit(key, &mm->lru_gen.bitmap); + + return mmget_not_zero(mm) ? 
mm : NULL; +} + void lru_gen_add_mm(struct mm_struct *mm) { int nid; @@ -2755,10 +2782,11 @@ void lru_gen_add_mm(struct mm_struct *mm) for_each_node_state(nid, N_MEMORY) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* the first addition since the last iteration */ - if (lruvec->mm_state.tail == &mm_list->fifo) - lruvec->mm_state.tail = &mm->lru_gen.list; + if (mm_state->tail == &mm_list->fifo) + mm_state->tail = &mm->lru_gen.list; } list_add_tail(&mm->lru_gen.list, &mm_list->fifo); @@ -2784,14 +2812,15 @@ void lru_gen_del_mm(struct mm_struct *mm) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* where the current iteration continues after */ - if (lruvec->mm_state.head == &mm->lru_gen.list) - lruvec->mm_state.head = lruvec->mm_state.head->prev; + if (mm_state->head == &mm->lru_gen.list) + mm_state->head = mm_state->head->prev; /* where the last iteration ended before */ - if (lruvec->mm_state.tail == &mm->lru_gen.list) - lruvec->mm_state.tail = lruvec->mm_state.tail->next; + if (mm_state->tail == &mm->lru_gen.list) + mm_state->tail = mm_state->tail->next; } list_del_init(&mm->lru_gen.list); @@ -2834,10 +2863,30 @@ void lru_gen_migrate_mm(struct mm_struct *mm) } #endif +#else /* !CONFIG_LRU_GEN_WALKS_MMU */ + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + return NULL; +} + +static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) +{ + return NULL; +} + +static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) +{ + return NULL; +} + +#endif + static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) { int i; int hist; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); @@ -2845,44 +2894,20 @@ static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, 
hist = lru_hist_from_seq(walk->max_seq); for (i = 0; i < NR_MM_STATS; i++) { - WRITE_ONCE(lruvec->mm_state.stats[hist][i], - lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); + WRITE_ONCE(mm_state->stats[hist][i], + mm_state->stats[hist][i] + walk->mm_stats[i]); walk->mm_stats[i] = 0; } } if (NR_HIST_GENS > 1 && last) { - hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); + hist = lru_hist_from_seq(mm_state->seq + 1); for (i = 0; i < NR_MM_STATS; i++) - WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); + WRITE_ONCE(mm_state->stats[hist][i], 0); } } -static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) -{ - int type; - unsigned long size = 0; - struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); - - if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) - return true; - - clear_bit(key, &mm->lru_gen.bitmap); - - for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { - size += type ? get_mm_counter(mm, MM_FILEPAGES) : - get_mm_counter(mm, MM_ANONPAGES) + - get_mm_counter(mm, MM_SHMEMPAGES); - } - - if (size < MIN_LRU_BATCH) - return true; - - return !mmget_not_zero(mm); -} - static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, struct mm_struct **iter) { @@ -2891,7 +2916,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, struct mm_struct *mm = NULL; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* * mm_state->seq is incremented after each iteration of mm_list. 
There @@ -2929,11 +2954,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, mm_state->tail = mm_state->head->next; walk->force_scan = true; } - - mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); - if (should_skip_mm(mm, walk)) - mm = NULL; - } while (!mm); + } while (!(mm = get_next_mm(walk))); done: if (*iter || last) reset_mm_stats(lruvec, walk, last); @@ -2941,7 +2962,7 @@ done: spin_unlock(&mm_list->lock); if (mm && first) - reset_bloom_filter(lruvec, walk->max_seq + 1); + reset_bloom_filter(mm_state, walk->max_seq + 1); if (*iter) mmput_async(*iter); @@ -2956,7 +2977,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); spin_lock(&mm_list->lock); @@ -3469,6 +3490,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; + struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3520,7 +3542,7 @@ restart: walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } - if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; @@ -3531,7 +3553,7 @@ restart: walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ - update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); + update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); @@ -3738,16 +3760,25 @@ next: return success; } -static void inc_max_seq(struct lruvec *lruvec, bool 
can_swap, bool force_scan) +static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + bool can_swap, bool force_scan) { + bool success; int prev, next; int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; restart: + if (max_seq < READ_ONCE(lrugen->max_seq)) + return false; + spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + success = max_seq == lrugen->max_seq; + if (!success) + goto unlock; + for (type = ANON_AND_FILE - 1; type >= 0; type--) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; @@ -3791,8 +3822,10 @@ restart: WRITE_ONCE(lrugen->timestamps[next], jiffies); /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); - +unlock: spin_unlock_irq(&lruvec->lru_lock); + + return success; } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, @@ -3802,14 +3835,16 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, struct lru_gen_mm_walk *walk; struct mm_struct *mm = NULL; struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + if (!mm_state) + return inc_max_seq(lruvec, max_seq, can_swap, force_scan); + /* see the comment in iterate_mm_list() */ - if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { - success = false; - goto done; - } + if (max_seq <= READ_ONCE(mm_state->seq)) + return false; /* * If the hardware doesn't automatically set the accessed bit, fallback @@ -3839,8 +3874,10 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, walk_mm(lruvec, mm, walk); } while (mm); done: - if (success) - inc_max_seq(lruvec, can_swap, force_scan); + if (success) { + success = inc_max_seq(lruvec, max_seq, can_swap, force_scan); + WARN_ON_ONCE(!success); + } return success; } @@ -3964,6 +4001,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) struct mem_cgroup *memcg = 
folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); int old_gen, new_gen = lru_gen_from_seq(max_seq); @@ -4042,8 +4080,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) mem_cgroup_unlock_pages(); /* feedback from rmap walkers to page table walkers */ - if (suitable_to_scan(i, young)) - update_bloom_filter(lruvec, max_seq, pvmw->pmd); + if (mm_state && suitable_to_scan(i, young)) + update_bloom_filter(mm_state, max_seq, pvmw->pmd); } /****************************************************************************** @@ -5219,6 +5257,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, int type, tier; int hist = lru_hist_from_seq(seq); struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); @@ -5244,6 +5283,9 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, seq_putc(m, '\n'); } + if (!mm_state) + return; + seq_puts(m, " "); for (i = 0; i < NR_MM_STATS; i++) { const char *s = " "; @@ -5251,10 +5293,10 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, if (seq == max_seq && NR_HIST_GENS == 1) { s = "LOYNFA"; - n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + n = READ_ONCE(mm_state->stats[hist][i]); } else if (seq != max_seq && NR_HIST_GENS > 1) { s = "loynfa"; - n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + n = READ_ONCE(mm_state->stats[hist][i]); } seq_printf(m, " %10lu%c", n, s[i]); @@ -5523,6 +5565,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) int i; int gen, type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); @@ -5533,7 +5576,8 @@ void 
lru_gen_init_lruvec(struct lruvec *lruvec) for_each_gen_type_zone(gen, type, zone) INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); - lruvec->mm_state.seq = MIN_NR_GENS; + if (mm_state) + mm_state->seq = MIN_NR_GENS; } #ifdef CONFIG_MEMCG @@ -5552,28 +5596,38 @@ void lru_gen_init_pgdat(struct pglist_data *pgdat) void lru_gen_init_memcg(struct mem_cgroup *memcg) { - INIT_LIST_HEAD(&memcg->mm_list.fifo); - spin_lock_init(&memcg->mm_list.lock); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + if (!mm_list) + return; + + INIT_LIST_HEAD(&mm_list->fifo); + spin_lock_init(&mm_list->lock); } void lru_gen_exit_memcg(struct mem_cgroup *memcg) { int i; int nid; + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); + VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo)); for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); lruvec->lrugen.list.next = LIST_POISON1; + if (!mm_state) + continue; + for (i = 0; i < NR_BLOOM_FILTERS; i++) { - bitmap_free(lruvec->mm_state.filters[i]); - lruvec->mm_state.filters[i] = NULL; + bitmap_free(mm_state->filters[i]); + mm_state->filters[i] = NULL; } } } -- cgit v1.2.3 From 745b13e647cd119e70d16b57698e12b7c86ca264 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 27 Dec 2023 14:12:03 +0000 Subject: mm/mglru: remove CONFIG_MEMCG Remove CONFIG_MEMCG in a refactoring to improve code readability at the cost of a few bytes in struct lru_gen_folio per node when CONFIG_MEMCG=n. 
Link: https://lkml.kernel.org/r/20231227141205.2200125-4-kinseyho@google.com Signed-off-by: Kinsey Ho Co-developed-by: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Tested-by: Donet Tom Acked-by: Yu Zhao Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 --- include/linux/mmzone.h | 26 ++----------------- mm/vmscan.c | 67 +++++++++++++++--------------------------------- 3 files changed, 23 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 552fa2d11c57..55b7121809ff 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1017,9 +1017,7 @@ struct lru_gen_mm_list { void lru_gen_add_mm(struct mm_struct *mm); void lru_gen_del_mm(struct mm_struct *mm); -#ifdef CONFIG_MEMCG void lru_gen_migrate_mm(struct mm_struct *mm); -#endif static inline void lru_gen_init_mm(struct mm_struct *mm) { @@ -1050,11 +1048,9 @@ static inline void lru_gen_del_mm(struct mm_struct *mm) { } -#ifdef CONFIG_MEMCG static inline void lru_gen_migrate_mm(struct mm_struct *mm) { } -#endif static inline void lru_gen_init_mm(struct mm_struct *mm) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bc3f63ec4291..28665e1b8475 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -440,14 +440,12 @@ struct lru_gen_folio { atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; -#ifdef CONFIG_MEMCG /* the memcg generation this lru_gen_folio belongs to */ u8 gen; /* the list segment this lru_gen_folio belongs to */ u8 seg; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_node list; -#endif }; enum { @@ -493,11 +491,6 @@ struct lru_gen_mm_walk { bool force_scan; }; -void lru_gen_init_lruvec(struct lruvec *lruvec); -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - -#ifdef CONFIG_MEMCG - /* * For each node, memcgs are divided into two generations: the 
old and the * young. For each generation, memcgs are randomly sharded into multiple bins @@ -555,6 +548,8 @@ struct lru_gen_memcg { }; void lru_gen_init_pgdat(struct pglist_data *pgdat); +void lru_gen_init_lruvec(struct lruvec *lruvec); +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); @@ -563,19 +558,6 @@ void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); -#else /* !CONFIG_MEMCG */ - -#define MEMCG_NR_GENS 1 - -struct lru_gen_memcg { -}; - -static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) -{ -} - -#endif /* CONFIG_MEMCG */ - #else /* !CONFIG_LRU_GEN */ static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) @@ -590,8 +572,6 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { } -#ifdef CONFIG_MEMCG - static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } @@ -616,8 +596,6 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } -#endif /* CONFIG_MEMCG */ - #endif /* CONFIG_LRU_GEN */ struct lruvec { diff --git a/mm/vmscan.c b/mm/vmscan.c index aa7ea09ffb4c..351a0b5043c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4097,13 +4097,6 @@ enum { MEMCG_LRU_YOUNG, }; -#ifdef CONFIG_MEMCG - -static int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return READ_ONCE(lruvec->lrugen.seg); -} - static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) { int seg; @@ -4150,6 +4143,8 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags); } +#ifdef CONFIG_MEMCG + void lru_gen_online_memcg(struct mem_cgroup *memcg) { int gen; @@ -4217,18 +4212,11 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) struct lruvec *lruvec = get_lruvec(memcg, nid); /* see the comment on MEMCG_NR_GENS */ 
- if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) + if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); } -#else /* !CONFIG_MEMCG */ - -static int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return 0; -} - -#endif +#endif /* CONFIG_MEMCG */ /****************************************************************************** * the eviction @@ -4776,7 +4764,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) if (mem_cgroup_below_low(NULL, memcg)) { /* see the comment on MEMCG_NR_GENS */ - if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL) + if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL) return MEMCG_LRU_TAIL; memcg_memory_event(memcg, MEMCG_LOW); @@ -4799,12 +4787,10 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) return 0; /* one retry if offlined or too small */ - return lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL ? + return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; } -#ifdef CONFIG_MEMCG - static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) { int op; @@ -4896,20 +4882,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc blk_finish_plug(&plug); } -#else /* !CONFIG_MEMCG */ - -static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) -{ - BUILD_BUG(); -} - -static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -{ - BUILD_BUG(); -} - -#endif - static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) { int priority; @@ -5560,6 +5532,18 @@ static const struct file_operations lru_gen_ro_fops = { * initialization ******************************************************************************/ +void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ + int i, j; + + spin_lock_init(&pgdat->memcg_lru.lock); + + for (i = 0; i < MEMCG_NR_GENS; i++) { + for (j = 0; j < MEMCG_NR_BINS; j++) + 
INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); + } +} + void lru_gen_init_lruvec(struct lruvec *lruvec) { int i; @@ -5582,18 +5566,6 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) #ifdef CONFIG_MEMCG -void lru_gen_init_pgdat(struct pglist_data *pgdat) -{ - int i, j; - - spin_lock_init(&pgdat->memcg_lru.lock); - - for (i = 0; i < MEMCG_NR_GENS; i++) { - for (j = 0; j < MEMCG_NR_BINS; j++) - INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); - } -} - void lru_gen_init_memcg(struct mem_cgroup *memcg) { struct lru_gen_mm_list *mm_list = get_mm_list(memcg); @@ -5653,14 +5625,17 @@ late_initcall(init_lru_gen); static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { + BUILD_BUG(); } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { + BUILD_BUG(); } static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { + BUILD_BUG(); } #endif /* CONFIG_LRU_GEN */ -- cgit v1.2.3 From 533c67e6358406727145efae32882c4dc355d6c5 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 27 Dec 2023 14:12:04 +0000 Subject: mm/mglru: add dummy pmd_dirty() Add dummy pmd_dirty() for architectures that don't provide it. This is similar to commit 6617da8fb565 ("mm: add dummy pmd_young() for architectures not having it"). 
Link: https://lkml.kernel.org/r/20231227141205.2200125-5-kinseyho@google.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312210606.1Etqz3M4-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202312210042.xQEiqlEh-lkp@intel.com/ Signed-off-by: Kinsey Ho Suggested-by: Yu Zhao Cc: Aneesh Kumar K.V Cc: Donet Tom Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgtable.h | 1 + arch/mips/include/asm/pgtable.h | 1 + arch/riscv/include/asm/pgtable.h | 1 + arch/s390/include/asm/pgtable.h | 1 + arch/sparc/include/asm/pgtable_64.h | 1 + arch/x86/include/asm/pgtable.h | 1 + include/linux/pgtable.h | 7 +++++++ 7 files changed, 13 insertions(+) (limited to 'include') diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 29d9b12298bc..8b5df1bbf9e9 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -523,6 +523,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd) return pmd; } +#define pmd_dirty pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return !!(pmd_val(pmd) & (_PAGE_DIRTY | _PAGE_MODIFIED)); diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 430b208c0130..e27a4c83c548 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -655,6 +655,7 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) return pmd; } +#define pmd_dirty pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_MODIFIED); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index ab00235b018f..7b4287f36054 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -673,6 +673,7 @@ static inline int pmd_write(pmd_t pmd) return pte_write(pmd_pte(pmd)); } +#define pmd_dirty pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return pte_dirty(pmd_pte(pmd)); diff --git a/arch/s390/include/asm/pgtable.h 
b/arch/s390/include/asm/pgtable.h index 601e87fa8a9a..1299b56e43f6 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -770,6 +770,7 @@ static inline int pud_write(pud_t pud) return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0; } +#define pmd_dirty pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 5e41033bf4ca..a8c871b7d786 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -706,6 +706,7 @@ static inline unsigned long pmd_write(pmd_t pmd) #define pud_write(pud) pte_write(__pte(pud_val(pud))) #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define pmd_dirty pmd_dirty static inline unsigned long pmd_dirty(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 08b5cb22d9a6..9d077bca6a10 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -141,6 +141,7 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +#define pmd_dirty pmd_dirty static inline bool pmd_dirty(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_DIRTY_BITS; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 9ecc20fa6269..466cf477551a 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -184,6 +184,13 @@ static inline int pmd_young(pmd_t pmd) } #endif +#ifndef pmd_dirty +static inline int pmd_dirty(pmd_t pmd) +{ + return 0; +} +#endif + /* * A facility to provide lazy MMU batching. 
This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode -- cgit v1.2.3 From e435ca87882167dda78776ce4bd6eb2094eb864b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Dec 2023 08:57:43 +0000 Subject: mm: remove inc/dec lruvec page state functions Patch series "Remove some lruvec page accounting functions", v2. Some functions are now unused; remove them. Make __mod_lruvec_page_state() unused and then remove it. This patch (of 6): All callers of these have been converted to their folio equivalents. Link: https://lkml.kernel.org/r/20231228085748.1083901-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231228085748.1083901-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Johannes Weiner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index fed855bae6d8..147ae73e0ee7 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -597,18 +597,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __inc_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - __mod_lruvec_page_state(page, idx, 1); -} - -static inline void __dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - __mod_lruvec_page_state(page, idx, -1); -} - static inline void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { @@ -627,18 +615,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio, __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } -static inline void inc_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - mod_lruvec_page_state(page, idx, 1); -} - -static inline void dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ 
- mod_lruvec_page_state(page, idx, -1); -} - static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { -- cgit v1.2.3 From c701123bd68bf1cc3bc167b4f597cb1f4995c39c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Dec 2023 08:57:48 +0000 Subject: mm/memcontrol: remove __mod_lruvec_page_state() There are no more callers of __mod_lruvec_page_state(), so convert the implementation to __lruvec_stat_mod_folio(), removing two calls to compound_head() (one explicit, one hidden inside page_memcg()). Link: https://lkml.kernel.org/r/20231228085748.1083901-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 36 ++++++++++++++++++------------------ mm/memcontrol.c | 9 ++++----- 2 files changed, 22 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 147ae73e0ee7..343906a98d6e 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -556,19 +556,25 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } -void __mod_lruvec_page_state(struct page *page, +void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); -static inline void mod_lruvec_page_state(struct page *page, +static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); - __mod_lruvec_page_state(page, idx, val); + __lruvec_stat_mod_folio(folio, idx, val); local_irq_restore(flags); } +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + lruvec_stat_mod_folio(page_folio(page), idx, val); +} + #else static inline void __mod_lruvec_state(struct lruvec *lruvec, @@ -583,10 +589,16 @@ static inline void 
mod_lruvec_state(struct lruvec *lruvec, mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } -static inline void __mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) +static inline void __lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) { - __mod_node_page_state(page_pgdat(page), idx, val); + __mod_node_page_state(folio_pgdat(folio), idx, val); +} + +static inline void lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) +{ + mod_node_page_state(folio_pgdat(folio), idx, val); } static inline void mod_lruvec_page_state(struct page *page, @@ -597,12 +609,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - __mod_lruvec_page_state(&folio->page, idx, val); -} - static inline void __lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { @@ -615,12 +621,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio, __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } -static inline void lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - mod_lruvec_page_state(&folio->page, idx, val); -} - static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ca691fb5b49..0082cef6e1fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -890,16 +890,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_lruvec_state(lruvec, idx, val); } -void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, +void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { - struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; - pg_data_t *pgdat = page_pgdat(page); + pg_data_t *pgdat = folio_pgdat(folio); struct lruvec 
*lruvec; rcu_read_lock(); - memcg = page_memcg(head); + memcg = folio_memcg(folio); /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!memcg) { rcu_read_unlock(); @@ -911,7 +910,7 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, __mod_lruvec_state(lruvec, idx, val); rcu_read_unlock(); } -EXPORT_SYMBOL(__mod_lruvec_page_state); +EXPORT_SYMBOL(__lruvec_stat_mod_folio); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { -- cgit v1.2.3 From b805ab3c6935d14654ccc28f16ffce7a13c2c528 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 29 Dec 2023 10:26:51 +0800 Subject: mm/vmstat: move pgdemote_* out of CONFIG_NUMA_BALANCING Demotion can work well without CONFIG_NUMA_BALANCING. But the commit 23e9f0138963 ("mm/vmstat: move pgdemote_* to per-node stats") wrongly hid it behind CONFIG_NUMA_BALANCING. Fix it by moving them out of CONFIG_NUMA_BALANCING. Link: https://lkml.kernel.org/r/20231229022651.3229174-1-lizhijian@fujitsu.com Fixes: 23e9f0138963 ("mm/vmstat: move pgdemote_* to per-node stats") Signed-off-by: Li Zhijian Cc: "Huang, Ying" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- mm/vmscan.c | 5 +---- mm/vmstat.c | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 28665e1b8475..c18c53353b50 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -207,11 +207,11 @@ enum node_stat_item { #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ PGPROMOTE_CANDIDATE, /* candidate pages to promote */ +#endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, -#endif NR_VM_NODE_STAT_ITEMS }; diff --git a/mm/vmscan.c b/mm/vmscan.c index ceba905e5630..600ed3cbf7cb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -409,12 +409,10 @@ void drop_slab(void) static int reclaimer_offset(void) { -#ifdef CONFIG_NUMA_BALANCING BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); -#endif BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != PGSCAN_DIRECT - PGSCAN_KSWAPD); BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != @@ -978,10 +976,9 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); -#ifdef CONFIG_NUMA_BALANCING + mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); -#endif return nr_succeeded; } diff --git a/mm/vmstat.c b/mm/vmstat.c index cfd8d8256f8e..1437ca2f28c5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1248,10 +1248,10 @@ const char * const vmstat_text[] = { #ifdef CONFIG_NUMA_BALANCING "pgpromote_success", "pgpromote_candidate", +#endif "pgdemote_kswapd", "pgdemote_direct", "pgdemote_khugepaged", -#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", -- cgit v1.2.3 From fd37721803c6e73619108f76ad2e12a9aa5fafaf Mon Sep 17 
00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 28 Dec 2023 17:47:03 +0300 Subject: mm, treewide: introduce NR_PAGE_ORDERS NR_PAGE_ORDERS defines the number of page orders supported by the page allocator, ranging from 0 to MAX_ORDER, MAX_ORDER + 1 in total. NR_PAGE_ORDERS assists in defining arrays of page orders and allows for more natural iteration over them. [kirill.shutemov@linux.intel.com: fixup for kerneldoc warning] Link: https://lkml.kernel.org/r/20240101111512.7empzyifq7kxtzk3@box Link: https://lkml.kernel.org/r/20231228144704.14033-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Zi Yan Cc: Linus Torvalds Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 6 +++--- arch/arm64/kvm/hyp/include/nvhe/gfp.h | 2 +- arch/sparc/kernel/traps_64.c | 2 +- drivers/gpu/drm/ttm/tests/ttm_device_test.c | 2 +- drivers/gpu/drm/ttm/ttm_pool.c | 20 ++++++++++---------- include/drm/ttm/ttm_pool.h | 2 +- include/linux/mmzone.h | 6 ++++-- kernel/crash_core.c | 2 +- lib/test_meminit.c | 2 +- mm/compaction.c | 2 +- mm/kmsan/init.c | 2 +- mm/page_alloc.c | 13 ++++++------- mm/page_reporting.c | 2 +- mm/show_mem.c | 8 ++++---- mm/vmstat.c | 12 ++++++------ 15 files changed, 42 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 78e4d2e7ba14..3f8769e46b07 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -172,7 +172,7 @@ variables. Offset of the free_list's member. This value is used to compute the number of free pages. -Each zone has a free_area structure array called free_area[MAX_ORDER + 1]. +Each zone has a free_area structure array called free_area[NR_PAGE_ORDERS]. The free_list represents a linked list of free page blocks. (list_head, next|prev) @@ -189,8 +189,8 @@ Offsets of the vmap_area's members. 
They carry vmalloc-specific information. Makedumpfile gets the start address of the vmalloc region from this. -(zone.free_area, MAX_ORDER + 1) -------------------------------- +(zone.free_area, NR_PAGE_ORDERS) +-------------------------------- Free areas descriptor. User-space tools use this value to iterate the free_area ranges. MAX_ORDER is used by the zone buddy allocator. diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index fe5472a184a3..97c527ef53c2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -16,7 +16,7 @@ struct hyp_pool { * API at EL2. */ hyp_spinlock_t lock; - struct list_head free_area[MAX_ORDER + 1]; + struct list_head free_area[NR_PAGE_ORDERS]; phys_addr_t range_start; phys_addr_t range_end; unsigned short max_order; diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 08ffd17d5ec3..523a6e5ee925 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void) /* Now allocate error trap reporting scoreboard. 
*/ sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info)); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { if ((PAGE_SIZE << order) >= sz) break; } diff --git a/drivers/gpu/drm/ttm/tests/ttm_device_test.c b/drivers/gpu/drm/ttm/tests/ttm_device_test.c index b1b423b68cdf..19eaff22e6ae 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_device_test.c +++ b/drivers/gpu/drm/ttm/tests/ttm_device_test.c @@ -175,7 +175,7 @@ static void ttm_device_init_pools(struct kunit *test) if (params->pools_init_expected) { for (int i = 0; i < TTM_NUM_CACHING_TYPES; ++i) { - for (int j = 0; j <= MAX_ORDER; ++j) { + for (int j = 0; j < NR_PAGE_ORDERS; ++j) { pt = pool->caching[i].orders[j]; KUNIT_EXPECT_PTR_EQ(test, pt.pool, pool); KUNIT_EXPECT_EQ(test, pt.caching, i); diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index fe610a3cace0..d183bb97c526 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644); static atomic_long_t allocated_pages; -static struct ttm_pool_type global_write_combined[MAX_ORDER + 1]; -static struct ttm_pool_type global_uncached[MAX_ORDER + 1]; +static struct ttm_pool_type global_write_combined[NR_PAGE_ORDERS]; +static struct ttm_pool_type global_uncached[NR_PAGE_ORDERS]; -static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1]; -static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1]; +static struct ttm_pool_type global_dma32_write_combined[NR_PAGE_ORDERS]; +static struct ttm_pool_type global_dma32_uncached[NR_PAGE_ORDERS]; static spinlock_t shrinker_lock; static struct list_head shrinker_list; @@ -568,7 +568,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, if (use_dma_alloc || nid != NUMA_NO_NODE) { for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j <= MAX_ORDER; ++j) + for (j = 0; j < NR_PAGE_ORDERS; ++j) ttm_pool_type_init(&pool->caching[i].orders[j], 
pool, i, j); } @@ -601,7 +601,7 @@ void ttm_pool_fini(struct ttm_pool *pool) if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) { for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j <= MAX_ORDER; ++j) + for (j = 0; j < NR_PAGE_ORDERS; ++j) ttm_pool_type_fini(&pool->caching[i].orders[j]); } @@ -656,7 +656,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m) unsigned int i; seq_puts(m, "\t "); - for (i = 0; i <= MAX_ORDER; ++i) + for (i = 0; i < NR_PAGE_ORDERS; ++i) seq_printf(m, " ---%2u---", i); seq_puts(m, "\n"); } @@ -667,7 +667,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt, { unsigned int i; - for (i = 0; i <= MAX_ORDER; ++i) + for (i = 0; i < NR_PAGE_ORDERS; ++i) seq_printf(m, " %8u", ttm_pool_type_count(&pt[i])); seq_puts(m, "\n"); } @@ -776,7 +776,7 @@ int ttm_pool_mgr_init(unsigned long num_pages) spin_lock_init(&shrinker_lock); INIT_LIST_HEAD(&shrinker_list); - for (i = 0; i <= MAX_ORDER; ++i) { + for (i = 0; i < NR_PAGE_ORDERS; ++i) { ttm_pool_type_init(&global_write_combined[i], NULL, ttm_write_combined, i); ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i); @@ -816,7 +816,7 @@ void ttm_pool_mgr_fini(void) { unsigned int i; - for (i = 0; i <= MAX_ORDER; ++i) { + for (i = 0; i < NR_PAGE_ORDERS; ++i) { ttm_pool_type_fini(&global_write_combined[i]); ttm_pool_type_fini(&global_uncached[i]); diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h index 30a347e5aa11..4490d43c63e3 100644 --- a/include/drm/ttm/ttm_pool.h +++ b/include/drm/ttm/ttm_pool.h @@ -74,7 +74,7 @@ struct ttm_pool { bool use_dma32; struct { - struct ttm_pool_type orders[MAX_ORDER + 1]; + struct ttm_pool_type orders[NR_PAGE_ORDERS]; } caching[TTM_NUM_CACHING_TYPES]; }; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c18c53353b50..1ea7636dfb76 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,6 +35,8 @@ #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) +#define 
NR_PAGE_ORDERS (MAX_ORDER + 1) + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should @@ -96,7 +98,7 @@ static inline bool migratetype_is_mergeable(int mt) } #define for_each_migratetype_order(order, type) \ - for (order = 0; order <= MAX_ORDER; order++) \ + for (order = 0; order < NR_PAGE_ORDERS; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; @@ -933,7 +935,7 @@ struct zone { CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ - struct free_area free_area[MAX_ORDER + 1]; + struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_ORDER */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index d4313b53837e..56cf4ad7abbb 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -802,7 +802,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_OFFSET(vmap_area, va_start); VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1); + VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); diff --git a/lib/test_meminit.c b/lib/test_meminit.c index 0ae35223d773..0dc173849a54 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures) int failures = 0, num_tests = 0; int i; - for (i = 0; i <= MAX_ORDER; i++) + for (i = 0; i < NR_PAGE_ORDERS; i++) num_tests += do_alloc_pages_order(i, &failures); REPORT_FAILURES_IN_FN(); diff --git a/mm/compaction.c b/mm/compaction.c index de15a2ef0af5..24f8eb4d6260 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2229,7 +2229,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) /* Direct compactor: Is a suitable page free? 
*/ ret = COMPACT_NO_SUITABLE_PAGE; - for (order = cc->order; order <= MAX_ORDER; order++) { + for (order = cc->order; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &cc->zone->free_area[order]; bool can_steal; diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c index ffedf4dbc49d..103e2e88ea03 100644 --- a/mm/kmsan/init.c +++ b/mm/kmsan/init.c @@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void) struct metadata_page_pair { struct page *shadow, *origin; }; -static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata; +static struct metadata_page_pair held_back[NR_PAGE_ORDERS] __initdata; /* * Eager metadata allocation. When the memblock allocator is freeing pages to diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5526797b7f96..ccecf6158ae4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1560,7 +1560,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, struct page *page; /* Find a page of the appropriate size in the preferred list */ - for (current_order = order; current_order <= MAX_ORDER; ++current_order) { + for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) { area = &(zone->free_area[current_order]); page = get_page_from_free_area(area, migratetype); if (!page) @@ -1934,7 +1934,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, continue; spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]); page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); @@ -2044,8 +2044,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, return false; find_smallest: - for (current_order = order; current_order <= MAX_ORDER; - current_order++) { + for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, 
current_order, start_migratetype, false, &can_steal); @@ -3000,7 +2999,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; /* For a high-order request, check at least one suitable page is free */ - for (o = order; o <= MAX_ORDER; o++) { + for (o = order; o < NR_PAGE_ORDERS; o++) { struct free_area *area = &z->free_area[o]; int mt; @@ -6628,7 +6627,7 @@ bool is_free_buddy_page(struct page *page) unsigned long pfn = page_to_pfn(page); unsigned int order; - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); if (PageBuddy(page_head) && @@ -6683,7 +6682,7 @@ bool take_page_off_buddy(struct page *page) bool ret = false; spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); int page_order = buddy_order(page_head); diff --git a/mm/page_reporting.c b/mm/page_reporting.c index b021f482a4cb..66369cc5279b 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev, return err; /* Process each free list starting from lowest order/mt */ - for (order = page_reporting_order; order <= MAX_ORDER; order++) { + for (order = page_reporting_order; order < NR_PAGE_ORDERS; order++) { for (mt = 0; mt < MIGRATE_TYPES; mt++) { /* We do not pull pages from the isolate free list */ if (is_migrate_isolate(mt)) diff --git a/mm/show_mem.c b/mm/show_mem.c index ba0808d6917f..8dcfafbd283c 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -352,8 +352,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z for_each_populated_zone(zone) { unsigned int order; - unsigned long nr[MAX_ORDER + 1], flags, total = 0; - unsigned char types[MAX_ORDER + 1]; + unsigned long nr[NR_PAGE_ORDERS], flags, 
total = 0; + unsigned char types[NR_PAGE_ORDERS]; if (zone_idx(zone) > max_zone_idx) continue; @@ -363,7 +363,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &zone->free_area[order]; int type; @@ -377,7 +377,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z } } spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { printk(KERN_CONT "%lu*%lukB ", nr[order], K(1UL) << order); if (nr[order]) diff --git a/mm/vmstat.c b/mm/vmstat.c index 1437ca2f28c5..03ead31c46a0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1059,7 +1059,7 @@ static void fill_contig_page_info(struct zone *zone, info->free_blocks_total = 0; info->free_blocks_suitable = 0; - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { unsigned long blocks; /* @@ -1476,7 +1476,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, int order; seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order <= MAX_ORDER; ++order) + for (order = 0; order < NR_PAGE_ORDERS; ++order) /* * Access to nr_free is lockless as nr_free is used only for * printing purposes. Use data_race to avoid KCSAN warning. 
@@ -1505,7 +1505,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, pgdat->node_id, zone->name, migratetype_names[mtype]); - for (order = 0; order <= MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { unsigned long freecount = 0; struct free_area *area; struct list_head *curr; @@ -1545,7 +1545,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg) /* Print header */ seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); - for (order = 0; order <= MAX_ORDER; ++order) + for (order = 0; order < NR_PAGE_ORDERS; ++order) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); @@ -2181,7 +2181,7 @@ static void unusable_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order <= MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { fill_contig_page_info(zone, order, &info); index = unusable_free_index(order, &info); seq_printf(m, "%d.%03d ", index / 1000, index % 1000); @@ -2233,7 +2233,7 @@ static void extfrag_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order <= MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { fill_contig_page_info(zone, order, &info); index = __fragmentation_index(order, &info); seq_printf(m, "%2d.%03d ", index / 1000, index % 1000); -- cgit v1.2.3 From 5e0a760b44417f7cadd79de2204d6247109558a0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 28 Dec 2023 17:47:04 +0300 Subject: mm, treewide: rename MAX_ORDER to MAX_PAGE_ORDER commit 23baf831a32c ("mm, treewide: redefine MAX_ORDER sanely") has changed the definition of MAX_ORDER to be inclusive. This has caused issues with code that was not yet upstream and depended on the previous definition. To draw attention to the altered meaning of the define, rename MAX_ORDER to MAX_PAGE_ORDER. 
Link: https://lkml.kernel.org/r/20231228144704.14033-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Linus Torvalds Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 2 +- Documentation/admin-guide/kernel-parameters.txt | 24 ++++++++++++------------ Documentation/networking/packet_mmap.rst | 14 +++++++------- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 20 ++++++++++---------- arch/arm64/include/asm/sparsemem.h | 2 +- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 3 ++- arch/arm64/mm/hugetlbpage.c | 2 +- arch/m68k/Kconfig.cpu | 2 +- arch/nios2/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/mm/book3s64/iommu_api.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/sh/mm/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/sparc/kernel/pci_sun4v.c | 2 +- arch/sparc/mm/tsb.c | 4 ++-- arch/um/kernel/um_arch.c | 4 ++-- arch/xtensa/Kconfig | 2 +- drivers/accel/qaic/qaic_data.c | 2 +- drivers/base/regmap/regmap-debugfs.c | 8 ++++---- drivers/block/floppy.c | 2 +- drivers/crypto/ccp/sev-dev.c | 2 +- drivers/crypto/hisilicon/sgl.c | 6 +++--- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/i915/gem/selftests/huge_pages.c | 2 +- drivers/gpu/drm/ttm/tests/ttm_pool_test.c | 8 ++++---- drivers/gpu/drm/ttm/ttm_pool.c | 2 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 +- drivers/iommu/dma-iommu.c | 2 +- drivers/irqchip/irq-gic-v3-its.c | 4 ++-- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-flakey.c | 2 +- drivers/misc/genwqe/card_dev.c | 2 +- drivers/misc/genwqe/card_utils.c | 4 ++-- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- drivers/net/ethernet/ibm/ibmvnic.h | 4 ++-- drivers/video/fbdev/hyperv_fb.c | 6 +++--- drivers/video/fbdev/vermilion/vermilion.c | 2 +- drivers/virtio/virtio_balloon.c | 2 +- drivers/virtio/virtio_mem.c | 8 ++++---- fs/ramfs/file-nommu.c | 2 +- include/linux/hugetlb.h | 2 +- include/linux/mmzone.h | 14 
+++++++------- include/linux/pageblock-flags.h | 4 ++-- include/linux/slab.h | 4 ++-- kernel/dma/pool.c | 6 +++--- kernel/dma/swiotlb.c | 4 ++-- kernel/events/ring_buffer.c | 10 +++++----- mm/Kconfig | 6 +++--- mm/compaction.c | 4 ++-- mm/debug_page_alloc.c | 2 +- mm/debug_vm_pgtable.c | 4 ++-- mm/huge_memory.c | 2 +- mm/hugetlb.c | 4 ++-- mm/internal.h | 2 +- mm/kmsan/init.c | 6 +++--- mm/memblock.c | 7 ++++--- mm/memory_hotplug.c | 9 +++++---- mm/mm_init.c | 22 +++++++++++----------- mm/page_alloc.c | 24 ++++++++++++------------ mm/page_isolation.c | 17 +++++++++-------- mm/page_owner.c | 6 +++--- mm/page_reporting.c | 4 ++-- mm/shuffle.h | 2 +- mm/slab.c | 2 +- mm/slub.c | 4 ++-- mm/vmscan.c | 2 +- mm/vmstat.c | 2 +- net/smc/smc_ib.c | 2 +- security/integrity/ima/ima_crypto.c | 2 +- tools/perf/Documentation/perf-intel-pt.txt | 2 +- tools/testing/memblock/linux/mmzone.h | 6 +++--- tools/testing/selftests/mm/thuge-gen.c | 3 ++- 76 files changed, 186 insertions(+), 181 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 3f8769e46b07..bced9e4b6e08 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -193,7 +193,7 @@ from this. -------------------------------- Free areas descriptor. User-space tools use this value to iterate the -free_area ranges. MAX_ORDER is used by the zone buddy allocator. +free_area ranges. NR_PAGE_ORDERS is used by the zone buddy allocator. prb --- diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 65731b060e3f..8a01b8112f0b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -970,17 +970,17 @@ buddy allocator. Bigger value increase the probability of catching random memory corruption, but reduce the amount of memory for normal system use. 
The maximum - possible value is MAX_ORDER/2. Setting this parameter - to 1 or 2 should be enough to identify most random - memory corruption problems caused by bugs in kernel or - driver code when a CPU writes to (or reads from) a - random memory location. Note that there exists a class - of memory corruptions problems caused by buggy H/W or - F/W or by drivers badly programming DMA (basically when - memory is written at bus level and the CPU MMU is - bypassed) which are not detectable by - CONFIG_DEBUG_PAGEALLOC, hence this option will not help - tracking down these problems. + possible value is MAX_PAGE_ORDER/2. Setting this + parameter to 1 or 2 should be enough to identify most + random memory corruption problems caused by bugs in + kernel or driver code when a CPU writes to (or reads + from) a random memory location. Note that there exists + a class of memory corruptions problems caused by buggy + H/W or F/W or by drivers badly programming DMA + (basically when memory is written at bus level and the + CPU MMU is bypassed) which are not detectable by + CONFIG_DEBUG_PAGEALLOC, hence this option will not + help tracking down these problems. debug_pagealloc= [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter @@ -4136,7 +4136,7 @@ [KNL] Minimal page reporting order Format: Adjust the minimal page reporting order. The page - reporting is disabled when it exceeds MAX_ORDER. + reporting is disabled when it exceeds MAX_PAGE_ORDER. 
panic= [KNL] Kernel behaviour on panic: delay timeout > 0: seconds before rebooting diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst index 30a3be3c48f3..dca15d15feaf 100644 --- a/Documentation/networking/packet_mmap.rst +++ b/Documentation/networking/packet_mmap.rst @@ -263,20 +263,20 @@ the name indicates, this function allocates pages of memory, and the second argument is "order" or a power of two number of pages, that is (for PAGE_SIZE == 4096) order=0 ==> 4096 bytes, order=1 ==> 8192 bytes, order=2 ==> 16384 bytes, etc. The maximum size of a -region allocated by __get_free_pages is determined by the MAX_ORDER macro. More -precisely the limit can be calculated as:: +region allocated by __get_free_pages is determined by the MAX_PAGE_ORDER macro. +More precisely the limit can be calculated as:: - PAGE_SIZE << MAX_ORDER + PAGE_SIZE << MAX_PAGE_ORDER In a i386 architecture PAGE_SIZE is 4096 bytes - In a 2.4/i386 kernel MAX_ORDER is 10 - In a 2.6/i386 kernel MAX_ORDER is 11 + In a 2.4/i386 kernel MAX_PAGE_ORDER is 10 + In a 2.6/i386 kernel MAX_PAGE_ORDER is 11 So get_free_pages can allocate as much as 4MB or 8MB in a 2.4/2.6 kernel respectively, with an i386 architecture. User space programs can include /usr/include/sys/user.h and -/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_ORDER declarations. +/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_PAGE_ORDER declarations. The pagesize can also be determined dynamically with the getpagesize (2) system call. 
@@ -324,7 +324,7 @@ Definitions: (see /proc/slabinfo) depends on the architecture -- ``sizeof(void *)`` depends on the architecture -- PAGE_SIZE or getpagesize (2) - is the value defined with MAX_ORDER + is the value defined with MAX_PAGE_ORDER it's an upper bound of frame's capture size (more on this later) ============== ================================================================ diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index f8567e95f98b..b2ab8db63c4b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1362,7 +1362,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. This option allows overriding the default setting when ability to allocate very diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 12d611f3da5d..442539fd06fe 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1520,15 +1520,15 @@ config XEN # include/linux/mmzone.h requires the following to be true: # -# MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS +# MAX_PAGE_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS # -# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT: +# so the maximum value of MAX_PAGE_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT: # -# | SECTION_SIZE_BITS | PAGE_SHIFT | max MAX_ORDER | default MAX_ORDER | -# ----+-------------------+--------------+-----------------+--------------------+ -# 4K | 27 | 12 | 15 | 10 | -# 16K | 27 | 14 | 13 | 11 | -# 64K | 29 | 16 | 13 | 13 | +# | SECTION_SIZE_BITS | PAGE_SHIFT | max MAX_PAGE_ORDER | default MAX_PAGE_ORDER | +# ----+-------------------+--------------+----------------------+-------------------------+ +# 4K | 27 | 12 | 15 | 10 | +# 16K | 27 | 14 | 13 | 11 | +# 64K | 29 | 16 | 13 | 13 | config ARCH_FORCE_MAX_ORDER 
int default "13" if ARM64_64K_PAGES @@ -1536,16 +1536,16 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. This option allows overriding the default setting when ability to allocate very large blocks of physically contiguous memory is required. The maximal size of allocation cannot exceed the size of the - section, so the value of MAX_ORDER should satisfy + section, so the value of MAX_PAGE_ORDER should satisfy - MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS + MAX_PAGE_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS Don't change if unsure. diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h index 5f5437621029..8a8acc220371 100644 --- a/arch/arm64/include/asm/sparsemem.h +++ b/arch/arm64/include/asm/sparsemem.h @@ -10,7 +10,7 @@ /* * Section size must be at least 512MB for 64K base * page size config. Otherwise it will be less than - * MAX_ORDER and the build process will fail. + * MAX_PAGE_ORDER and the build process will fail. 
*/ #ifdef CONFIG_ARM64_64K_PAGES #define SECTION_SIZE_BITS 29 diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index b1e392186a0f..e691290d3765 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -228,7 +228,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, int i; hyp_spin_lock_init(&pool->lock); - pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT)); + pool->max_order = min(MAX_PAGE_ORDER, + get_order(nr_pages << PAGE_SHIFT)); for (i = 0; i <= pool->max_order; i++) INIT_LIST_HEAD(&pool->free_area[i]); pool->range_start = phys; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f5aae342632c..8116ac599f80 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -51,7 +51,7 @@ void __init arm64_hugetlb_cma_reserve(void) * page allocator. Just warn if there is any change * breaking this assumption. */ - WARN_ON(order <= MAX_ORDER); + WARN_ON(order <= MAX_PAGE_ORDER); hugetlb_cma_reserve(order); } #endif /* CONFIG_CMA */ diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index ad69b466a08b..9dcf245c9cbf 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -402,7 +402,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. 
This option allows overriding the default setting when ability to allocate very diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index d54464021a61..58d9565dc2c7 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -50,7 +50,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. This option allows overriding the default setting when ability to allocate very diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1f11a62809f2..52d7e3fad553 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -915,7 +915,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. 
This option allows overriding the default setting when ability to allocate very diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index d19fb1f3007d..c0e8d597e4cb 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } mmap_read_lock(mm); - chunk = (1UL << (PAGE_SHIFT + MAX_ORDER)) / + chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) / sizeof(struct vm_area_struct *); chunk = min(chunk, entries); for (entry = 0; entry < entries; entry += chunk) { diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f7c683b672c1..0a540b37aab6 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -615,7 +615,7 @@ void __init gigantic_hugetlb_cma_reserve(void) order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT; if (order) { - VM_WARN_ON(order <= MAX_ORDER); + VM_WARN_ON(order <= MAX_PAGE_ORDER); hugetlb_cma_reserve(order); } } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 28fac4770073..23f5b5093ec1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1389,7 +1389,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) * DMA window can be larger than available memory, which will * cause errors later. */ - const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER); + const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_PAGE_ORDER); /* * We create the default window as big as we can. The constraint is diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 511c17aede4a..455311d9a5e9 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -26,7 +26,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. 
The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. This option allows overriding the default setting when ability to allocate very diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 49849790e66d..204c43cb3d43 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -277,7 +277,7 @@ config ARCH_FORCE_MAX_ORDER default "12" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. This option allows overriding the default setting when ability to allocate very diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index c80b0a21d709..083e5f05a7f0 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -194,7 +194,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, size = IO_PAGE_ALIGN(size); order = get_order(size); - if (unlikely(order > MAX_ORDER)) + if (unlikely(order > MAX_PAGE_ORDER)) return NULL; npages = size >> IO_PAGE_SHIFT; diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index 5e2931a18409..6acd8a4c1e2a 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -402,8 +402,8 @@ void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss) unsigned long new_rss_limit; gfp_t gfp_flags; - if (max_tsb_size > PAGE_SIZE << MAX_ORDER) - max_tsb_size = PAGE_SIZE << MAX_ORDER; + if (max_tsb_size > PAGE_SIZE << MAX_PAGE_ORDER) + max_tsb_size = PAGE_SIZE << MAX_PAGE_ORDER; new_cache_index = 0; for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) { diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index b1bfed0c8528..7a9820797eae 100644 ---
a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -373,10 +373,10 @@ int __init linux_main(int argc, char **argv) max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; /* - * Zones have to begin on a 1 << MAX_ORDER page boundary, + * Zones have to begin on a 1 << MAX_PAGE_ORDER page boundary, * so this makes sure that's true for highmem */ - max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1); + max_physmem &= ~((1 << (PAGE_SHIFT + MAX_PAGE_ORDER)) - 1); if (physmem_size + iomem_size > max_physmem) { highmem = physmem_size + iomem_size - max_physmem; physmem_size -= highmem; diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 7d792077e5fd..e031eaf36c99 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -793,7 +793,7 @@ config ARCH_FORCE_MAX_ORDER default "10" help The kernel page allocator limits the size of maximal physically - contiguous allocations. The limit is called MAX_ORDER and it + contiguous allocations. The limit is called MAX_PAGE_ORDER and it defines the maximal power of two of number of pages that can be allocated as a single contiguous block. 
This option allows overriding the default setting when ability to allocate very diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index 4a8e43a7a6a4..aaeb2c9c071a 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -451,7 +451,7 @@ static int create_sgt(struct qaic_device *qdev, struct sg_table **sgt_out, u64 s * later */ buf_extra = (PAGE_SIZE - size % PAGE_SIZE) % PAGE_SIZE; - max_order = min(MAX_ORDER - 1, get_order(size)); + max_order = min(MAX_PAGE_ORDER - 1, get_order(size)); } else { /* allocate a single page for book keeping */ nr_pages = 1; diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index bdd80b73c3e6..fb84cda92a75 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -226,8 +226,8 @@ static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from, if (*ppos < 0 || !count) return -EINVAL; - if (count > (PAGE_SIZE << MAX_ORDER)) - count = PAGE_SIZE << MAX_ORDER; + if (count > (PAGE_SIZE << MAX_PAGE_ORDER)) + count = PAGE_SIZE << MAX_PAGE_ORDER; buf = kmalloc(count, GFP_KERNEL); if (!buf) @@ -373,8 +373,8 @@ static ssize_t regmap_reg_ranges_read_file(struct file *file, if (*ppos < 0 || !count) return -EINVAL; - if (count > (PAGE_SIZE << MAX_ORDER)) - count = PAGE_SIZE << MAX_ORDER; + if (count > (PAGE_SIZE << MAX_PAGE_ORDER)) + count = PAGE_SIZE << MAX_PAGE_ORDER; buf = kmalloc(count, GFP_KERNEL); if (!buf) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 11114a5d9e5c..d0e41d52d6a9 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3079,7 +3079,7 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr) } } -#define MAX_LEN (1UL << MAX_ORDER << PAGE_SHIFT) +#define MAX_LEN (1UL << MAX_PAGE_ORDER << PAGE_SHIFT) static int raw_cmd_copyin(int cmd, void __user *param, struct floppy_raw_cmd **rcmd) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 
fcaccd0b5a65..e4d3f45242f6 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -906,7 +906,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp) /* * The length of the ID shouldn't be assumed by software since * it may change in the future. The allocation size is limited - * to 1 << (PAGE_SHIFT + MAX_ORDER) by the page allocator. + * to 1 << (PAGE_SHIFT + MAX_PAGE_ORDER) by the page allocator. * If the allocation fails, simply return ENOMEM rather than * warning in the kernel log. */ diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c index 3df7a256e919..5c1012d7ffa9 100644 --- a/drivers/crypto/hisilicon/sgl.c +++ b/drivers/crypto/hisilicon/sgl.c @@ -70,11 +70,11 @@ struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev, HISI_ACC_SGL_ALIGN_SIZE); /* - * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_ORDER, + * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_PAGE_ORDER, * block size may exceed 2^31 on ia64, so the max of block size is 2^31 */ - block_size = 1 << (PAGE_SHIFT + MAX_ORDER < 32 ? - PAGE_SHIFT + MAX_ORDER : 31); + block_size = 1 << (PAGE_SHIFT + MAX_PAGE_ORDER < 32 ? 
+ PAGE_SHIFT + MAX_PAGE_ORDER : 31); sgl_num_per_block = block_size / sgl_size; block_num = count / sgl_num_per_block; remain_sgl = count % sgl_num_per_block; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c index 6bc26b4b06b8..ea7561ae6e13 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c @@ -36,7 +36,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj) struct sg_table *st; struct scatterlist *sg; unsigned int npages; /* restricted by sg_alloc_table */ - int max_order = MAX_ORDER; + int max_order = MAX_PAGE_ORDER; unsigned int max_segment; gfp_t gfp; diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c index 6b9f6cf50bf6..84c50c4c4af7 100644 --- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c +++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c @@ -115,7 +115,7 @@ static int get_huge_pages(struct drm_i915_gem_object *obj) do { struct page *page; - GEM_BUG_ON(order > MAX_ORDER); + GEM_BUG_ON(order > MAX_PAGE_ORDER); page = alloc_pages(GFP | __GFP_ZERO, order); if (!page) goto err; diff --git a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c index 2d9cae8cd984..cceaa18d4e46 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c +++ b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c @@ -109,7 +109,7 @@ static const struct ttm_pool_test_case ttm_pool_basic_cases[] = { }, { .description = "Above the allocation limit", - .order = MAX_ORDER + 1, + .order = MAX_PAGE_ORDER + 1, }, { .description = "One page, with coherent DMA mappings enabled", @@ -118,7 +118,7 @@ static const struct ttm_pool_test_case ttm_pool_basic_cases[] = { }, { .description = "Above the allocation limit, with coherent DMA mappings enabled", - .order = MAX_ORDER + 1, + .order = MAX_PAGE_ORDER + 1, .use_dma_alloc = true, }, }; @@ -165,7 +165,7 @@ static void 
ttm_pool_alloc_basic(struct kunit *test) fst_page = tt->pages[0]; last_page = tt->pages[tt->num_pages - 1]; - if (params->order <= MAX_ORDER) { + if (params->order <= MAX_PAGE_ORDER) { if (params->use_dma_alloc) { KUNIT_ASSERT_NOT_NULL(test, (void *)fst_page->private); KUNIT_ASSERT_NOT_NULL(test, (void *)last_page->private); @@ -182,7 +182,7 @@ static void ttm_pool_alloc_basic(struct kunit *test) * order 0 blocks */ KUNIT_ASSERT_EQ(test, fst_page->private, - min_t(unsigned int, MAX_ORDER, + min_t(unsigned int, MAX_PAGE_ORDER, params->order)); KUNIT_ASSERT_EQ(test, last_page->private, 0); } diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index d183bb97c526..b62f420a9f96 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -447,7 +447,7 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt, else gfp_flags |= GFP_HIGHUSER; - for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages)); + for (order = min_t(unsigned int, MAX_PAGE_ORDER, __fls(num_pages)); num_pages; order = min_t(unsigned int, order, __fls(num_pages))) { struct ttm_pool_type *pt; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 961205ba86d2..925ac6a47bce 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -188,7 +188,7 @@ #ifdef CONFIG_CMA_ALIGNMENT #define Q_MAX_SZ_SHIFT (PAGE_SHIFT + CONFIG_CMA_ALIGNMENT) #else -#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_ORDER) +#define Q_MAX_SZ_SHIFT (PAGE_SHIFT + MAX_PAGE_ORDER) #endif /* diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 85163a83df2f..e59f50e11ea8 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -884,7 +884,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, struct page **pages; unsigned int i = 0, nid = dev_to_node(dev); - order_mask &= GENMASK(MAX_ORDER, 0); + order_mask &= GENMASK(MAX_PAGE_ORDER, 
0); if (!order_mask) return NULL; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 9a7a74239eab..d097001c1e3e 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2465,8 +2465,8 @@ static bool its_parse_indirect_baser(struct its_node *its, * feature is not supported by hardware. */ new_order = max_t(u32, get_order(esz << ids), new_order); - if (new_order > MAX_ORDER) { - new_order = MAX_ORDER; + if (new_order > MAX_PAGE_ORDER) { + new_order = MAX_PAGE_ORDER; ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz); pr_warn("ITS@%pa: %s Table too large, reduce ids %llu->%u\n", &its->phys_base, its_base_type_string[type], diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index f03d7dba270c..13c65b7e1ed6 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1170,7 +1170,7 @@ static void __cache_size_refresh(void) * If the allocation may fail we use __get_free_pages. Memory fragmentation * won't have a fatal effect here, but it just causes flushes of some other * buffers and more I/O will be performed. Don't use __get_free_pages if it - * always fails (i.e. order > MAX_ORDER). + * always fails (i.e. order > MAX_PAGE_ORDER). * * If the allocation shouldn't fail we use __vmalloc. 
This is only for the * initial reserve allocation, so there's no risk of wasting all vmalloc diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 2ae8560b6a14..855b482cbff1 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1673,7 +1673,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size) unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; unsigned int remaining_size; - unsigned int order = MAX_ORDER; + unsigned int order = MAX_PAGE_ORDER; retry: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index f57fb821528d..7916ed9f10e8 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -434,7 +434,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b remaining_size = size; - order = MAX_ORDER; + order = MAX_PAGE_ORDER; while (remaining_size) { struct page *pages; unsigned size_to_add, to_copy; diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c index 55fc5b80e649..4441aca2280a 100644 --- a/drivers/misc/genwqe/card_dev.c +++ b/drivers/misc/genwqe/card_dev.c @@ -443,7 +443,7 @@ static int genwqe_mmap(struct file *filp, struct vm_area_struct *vma) if (vsize == 0) return -EINVAL; - if (get_order(vsize) > MAX_ORDER) + if (get_order(vsize) > MAX_PAGE_ORDER) return -ENOMEM; dma_map = kzalloc(sizeof(struct dma_mapping), GFP_KERNEL); diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index 1c798d6b2dfb..a2c4a9b4f871 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -210,7 +210,7 @@ u32 genwqe_crc32(u8 *buff, size_t len, u32 init) void *__genwqe_alloc_consistent(struct genwqe_dev *cd, size_t size, dma_addr_t *dma_handle) { - if (get_order(size) > MAX_ORDER) + if (get_order(size) > MAX_PAGE_ORDER) return NULL; return dma_alloc_coherent(&cd->pci_dev->dev, size, dma_handle, 
@@ -308,7 +308,7 @@ int genwqe_alloc_sync_sgl(struct genwqe_dev *cd, struct genwqe_sgl *sgl, sgl->write = write; sgl->sgl_size = genwqe_sgl_size(sgl->nr_pages); - if (get_order(sgl->sgl_size) > MAX_ORDER) { + if (get_order(sgl->sgl_size) > MAX_PAGE_ORDER) { dev_err(&pci_dev->dev, "[%s] err: too much memory requested!\n", __func__); return ret; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index b618797a7e8d..f1695c889d3a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1041,7 +1041,7 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) return; order = get_order(alloc_size); - if (order > MAX_ORDER) { + if (order > MAX_PAGE_ORDER) { if (net_ratelimit()) dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n"); return; diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 4e18b4cefa97..94ac36b1408b 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -48,7 +48,7 @@ * of 4096 jumbo frames (MTU=9000) we will need about 9K*4K = 36MB plus * some padding. * - * But the size of a single DMA region is limited by MAX_ORDER in the + * But the size of a single DMA region is limited by MAX_PAGE_ORDER in the * kernel (about 16MB currently). To support say 4K Jumbo frames, we * use a set of LTBs (struct ltb_set) per pool. * @@ -75,7 +75,7 @@ * pool for the 4MB. Thus the 16 Rx and Tx queues require 32 * 5 = 160 * plus 16 for the TSO pools for a total of 176 LTB mappings per VNIC. 
*/ -#define IBMVNIC_ONE_LTB_MAX ((u32)((1 << MAX_ORDER) * PAGE_SIZE)) +#define IBMVNIC_ONE_LTB_MAX ((u32)((1 << MAX_PAGE_ORDER) * PAGE_SIZE)) #define IBMVNIC_ONE_LTB_SIZE min((u32)(8 << 20), IBMVNIC_ONE_LTB_MAX) #define IBMVNIC_LTB_SET_SIZE (38 << 20) diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index a80939fe2ee6..6a29d2594b91 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -927,8 +927,8 @@ static phys_addr_t hvfb_get_phymem(struct hv_device *hdev, if (request_size == 0) return -1; - if (order <= MAX_ORDER) { - /* Call alloc_pages if the size is less than 2^MAX_ORDER */ + if (order <= MAX_PAGE_ORDER) { + /* Call alloc_pages if the size is less than 2^MAX_PAGE_ORDER */ page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); if (!page) return -1; @@ -958,7 +958,7 @@ static void hvfb_release_phymem(struct hv_device *hdev, { unsigned int order = get_order(size); - if (order <= MAX_ORDER) + if (order <= MAX_PAGE_ORDER) __free_pages(pfn_to_page(paddr >> PAGE_SHIFT), order); else dma_free_coherent(&hdev->device, diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c index 840ead69654b..a32e5b2924c9 100644 --- a/drivers/video/fbdev/vermilion/vermilion.c +++ b/drivers/video/fbdev/vermilion/vermilion.c @@ -197,7 +197,7 @@ static int vmlfb_alloc_vram(struct vml_info *vinfo, va = &vinfo->vram[i]; order = 0; - while (requested > (PAGE_SIZE << order) && order <= MAX_ORDER) + while (requested > (PAGE_SIZE << order) && order <= MAX_PAGE_ORDER) order++; err = vmlfb_alloc_vram_area(va, order, 0); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 1fe93e93f5bc..59cdc0292dce 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -33,7 +33,7 @@ #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \ __GFP_NOMEMALLOC) /* The order of free page blocks to report to host */ -#define 
VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER +#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_PAGE_ORDER /* The size of a free page block in bytes */ #define VIRTIO_BALLOON_HINT_BLOCK_BYTES \ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index fa5226c198cc..8e3223294442 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1154,13 +1154,13 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn, */ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) { - unsigned long order = MAX_ORDER; + unsigned long order = MAX_PAGE_ORDER; unsigned long i; /* * We might get called for ranges that don't cover properly aligned - * MAX_ORDER pages; however, we can only online properly aligned - * pages with an order of MAX_ORDER at maximum. + * MAX_PAGE_ORDER pages; however, we can only online properly aligned + * pages with an order of MAX_PAGE_ORDER at maximum. */ while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) order--; @@ -1280,7 +1280,7 @@ static void virtio_mem_online_page(struct virtio_mem *vm, bool do_online; /* - * We can get called with any order up to MAX_ORDER. If our subblock + * We can get called with any order up to MAX_PAGE_ORDER. If our subblock * size is smaller than that and we have a mixture of plugged and * unplugged subblocks within such a page, we have to process in * smaller granularity. 
In that case we'll adjust the order exactly once diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index efb1b4c1a0a4..7a6d980e614d 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* make various checks */ order = get_order(newsize); - if (unlikely(order > MAX_ORDER)) + if (unlikely(order > MAX_PAGE_ORDER)) return -EFBIG; ret = inode_newsize_ok(inode, newsize); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 236ec7b63c54..c1ee640d87b1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -829,7 +829,7 @@ static inline unsigned huge_page_shift(struct hstate *h) static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) > MAX_ORDER; + return huge_page_order(h) > MAX_PAGE_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1ea7636dfb76..4ed33b127821 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -27,15 +27,15 @@ /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER -#define MAX_ORDER 10 +#define MAX_PAGE_ORDER 10 #else -#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER +#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif -#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) +#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) -#define NR_PAGE_ORDERS (MAX_ORDER + 1) +#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed @@ -938,7 +938,7 @@ struct zone { struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY - /* Pages to be accepted. All pages on the list are MAX_ORDER */ + /* Pages to be accepted. 
All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; #endif @@ -1748,8 +1748,8 @@ static inline bool movable_only_nodes(nodemask_t *nodes) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) -#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS -#error Allocator MAX_ORDER exceeds SECTION_SIZE +#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS +#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e83c4c095041..3f2409b968ec 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,14 +41,14 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. */ -#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER) +#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order MAX_ORDER +#define pageblock_order MAX_PAGE_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/slab.h b/include/linux/slab.h index d6d6ffeeb9a2..d63823e518c0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -308,7 +308,7 @@ static inline unsigned int arch_slab_minalign(void) * (PAGE_SIZE*2). Larger requests are passed to the page allocator. 
*/ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_PAGE_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif @@ -316,7 +316,7 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLUB #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_PAGE_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index b481c48a31a6..d10613eb0f63 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, void *addr; int ret = -ENOMEM; - /* Cannot allocate larger than MAX_ORDER */ - order = min(get_order(pool_size), MAX_ORDER); + /* Cannot allocate larger than MAX_PAGE_ORDER */ + order = min(get_order(pool_size), MAX_PAGE_ORDER); do { pool_size = 1 << (PAGE_SHIFT + order); @@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void) /* * If coherent_pool was not used on the command line, default the pool - * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER. + * sizes to 128KB per 1GB of memory, min 128KB, max MAX_PAGE_ORDER. 
*/ if (!atomic_pool_size) { unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 33d942615be5..176078bf2215 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -686,8 +686,8 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev, size_t pool_size; size_t tlb_size; - if (nslabs > SLABS_PER_PAGE << MAX_ORDER) { - nslabs = SLABS_PER_PAGE << MAX_ORDER; + if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) { + nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER; nareas = limit_nareas(nareas, nslabs); } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index e8d82c2f07d0..60ed43d1c29e 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -610,8 +610,8 @@ static struct page *rb_alloc_aux_page(int node, int order) { struct page *page; - if (order > MAX_ORDER) - order = MAX_ORDER; + if (order > MAX_PAGE_ORDER) + order = MAX_PAGE_ORDER; do { page = alloc_pages_node(node, PERF_AUX_GFP, order); @@ -702,9 +702,9 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, /* * kcalloc_node() is unable to allocate buffer if the size is larger - * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case. + * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case. */ - if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER) + if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER) return -ENOMEM; rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, node); @@ -821,7 +821,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER) + if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER) goto fail; node = (cpu == -1) ? 
cpu : cpu_to_node(cpu); diff --git a/mm/Kconfig b/mm/Kconfig index 79d563d8f9e0..cb9d470f0bf7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -381,7 +381,7 @@ config SHUFFLE_PAGE_ALLOCATOR the presence of a memory-side-cache. There are also incidental security benefits as it reduces the predictability of page allocations to compliment SLAB_FREELIST_RANDOM, but the - default granularity of shuffling on the MAX_ORDER i.e, 10th + default granularity of shuffling on the MAX_PAGE_ORDER i.e, 10th order of pages is selected based on cache utilization benefits on x86. @@ -713,8 +713,8 @@ config HUGETLB_PAGE_SIZE_VARIABLE HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available on a platform. - Note that the pageblock_order cannot exceed MAX_ORDER and will be - clamped down to MAX_ORDER. + Note that the pageblock_order cannot exceed MAX_PAGE_ORDER and will be + clamped down to MAX_PAGE_ORDER. config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA diff --git a/mm/compaction.c b/mm/compaction.c index 24f8eb4d6260..27ada42924d5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -999,7 +999,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * a valid page order. Consider only values in the * valid order range to prevent low_pfn overflow. 
*/ - if (freepage_order > 0 && freepage_order <= MAX_ORDER) { + if (freepage_order > 0 && freepage_order <= MAX_PAGE_ORDER) { low_pfn += (1UL << freepage_order) - 1; nr_scanned += (1UL << freepage_order) - 1; } @@ -1017,7 +1017,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (PageCompound(page) && !cc->alloc_contig) { const unsigned int order = compound_order(page); - if (likely(order <= MAX_ORDER)) { + if (likely(order <= MAX_PAGE_ORDER)) { low_pfn += (1UL << order) - 1; nr_scanned += (1UL << order) - 1; } diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c index f9d145730fd1..6755f0c9d4a3 100644 --- a/mm/debug_page_alloc.c +++ b/mm/debug_page_alloc.c @@ -22,7 +22,7 @@ static int __init debug_guardpage_minorder_setup(char *buf) { unsigned long res; - if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_PAGE_ORDER / 2) { pr_err("Bad debug_guardpage_minorder value\n"); return 0; } diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index e651500e597a..5662e29fe253 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1091,7 +1091,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) struct page *page = NULL; #ifdef CONFIG_CONTIG_ALLOC - if (order > MAX_ORDER) { + if (order > MAX_PAGE_ORDER) { page = alloc_contig_pages((1 << order), GFP_KERNEL, first_online_node, NULL); if (page) { @@ -1101,7 +1101,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) } #endif - if (order <= MAX_ORDER) + if (order <= MAX_PAGE_ORDER) page = alloc_pages(GFP_KERNEL, order); return page; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1a588e29d287..b9a7a57691d7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -682,7 +682,7 @@ static int __init hugepage_init(void) /* * hugepages can't be allocated by the buddy allocator */ - MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER); + 
MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER); /* * we use page->mapping and page->index in second tail page * as list_head: assuming THP order >= 2 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 378e460a6ab4..0d262784ce60 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3410,7 +3410,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, /* * Put bootmem huge pages into the standard lists after mem_map is up. - * Note: This only applies to gigantic (order > MAX_ORDER) pages. + * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ static void __init gather_bootmem_prealloc(void) { @@ -4790,7 +4790,7 @@ static int __init default_hugepagesz_setup(char *s) * The number of default huge pages (for this size) could have been * specified as the first hugetlb parameter: hugepages=X. If so, * then default_hstate_max_huge_pages is set. If the default huge - * page size is gigantic (> MAX_ORDER), then the pages must be + * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be * allocated here from bootmem allocator. 
*/ if (default_hstate_max_huge_pages) { diff --git a/mm/internal.h b/mm/internal.h index ac40c3d00336..f309a010d50f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -335,7 +335,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy, * satisfies the following equation: * P = B & ~(1 << O) * - * Assumption: *_mem_map is contiguous at least up to MAX_ORDER + * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER */ static inline unsigned long __find_buddy_pfn(unsigned long page_pfn, unsigned int order) diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c index 103e2e88ea03..3ac3b8921d36 100644 --- a/mm/kmsan/init.c +++ b/mm/kmsan/init.c @@ -141,7 +141,7 @@ struct smallstack { static struct smallstack collect = { .index = 0, - .order = MAX_ORDER, + .order = MAX_PAGE_ORDER, }; static void smallstack_push(struct smallstack *stack, struct page *pages) @@ -211,8 +211,8 @@ static void kmsan_memblock_discard(void) * order=N-1, * - repeat. */ - collect.order = MAX_ORDER; - for (int i = MAX_ORDER; i >= 0; i--) { + collect.order = MAX_PAGE_ORDER; + for (int i = MAX_PAGE_ORDER; i >= 0; i--) { if (held_back[i].shadow) smallstack_push(&collect, held_back[i].shadow); if (held_back[i].origin) diff --git a/mm/memblock.c b/mm/memblock.c index 4a62f7774b65..8c194d8afeec 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2113,12 +2113,13 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) * Free the pages in the largest chunks alignment allows. * * __ffs() behaviour is undefined for 0. start == 0 is - * MAX_ORDER-aligned, set order to MAX_ORDER for the case. + * MAX_PAGE_ORDER-aligned, set order to MAX_PAGE_ORDER for + * the case. 
*/ if (start) - order = min_t(int, MAX_ORDER, __ffs(start)); + order = min_t(int, MAX_PAGE_ORDER, __ffs(start)); else - order = MAX_ORDER; + order = MAX_PAGE_ORDER; while (start + (1UL << order) > end) order--; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 926e1cfb10e9..b3c0ff52bb72 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -645,7 +645,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) unsigned long pfn; /* - * Online the pages in MAX_ORDER aligned chunks. The callback might + * Online the pages in MAX_PAGE_ORDER aligned chunks. The callback might * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). @@ -660,12 +660,13 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * Free to online pages in the largest chunks alignment allows. * * __ffs() behaviour is undefined for 0. start == 0 is - * MAX_ORDER-aligned, Set order to MAX_ORDER for the case. + * MAX_PAGE_ORDER-aligned, Set order to MAX_PAGE_ORDER for + * the case. 
*/ if (pfn) - order = min_t(int, MAX_ORDER, __ffs(pfn)); + order = min_t(int, MAX_PAGE_ORDER, __ffs(pfn)); else - order = MAX_ORDER; + order = MAX_PAGE_ORDER; (*online_page_callback)(pfn_to_page(pfn), order); pfn += (1UL << order); diff --git a/mm/mm_init.c b/mm/mm_init.c index 2830eef2b16c..89dc29f1e6c6 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1455,7 +1455,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = MAX_ORDER; + unsigned int order = MAX_PAGE_ORDER; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) @@ -1638,7 +1638,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); offset = pgdat->node_start_pfn - start; /* - * The zone's endpoints aren't required to be MAX_ORDER + * The zone's endpoints aren't required to be MAX_PAGE_ORDER * aligned but the node_mem_map endpoints must be in order * for the buddy allocator to function correctly. */ @@ -1964,11 +1964,11 @@ static void __init deferred_free_range(unsigned long pfn, if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) { for (i = 0; i < nr_pages; i += pageblock_nr_pages) set_pageblock_migratetype(page + i, MIGRATE_MOVABLE); - __free_pages_core(page, MAX_ORDER); + __free_pages_core(page, MAX_PAGE_ORDER); return; } - /* Accept chunks smaller than MAX_ORDER upfront */ + /* Accept chunks smaller than MAX_PAGE_ORDER upfront */ accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); for (i = 0; i < nr_pages; i++, page++, pfn++) { @@ -1991,8 +1991,8 @@ static inline void __init pgdat_init_report_one_done(void) /* * Returns true if page needs to be initialized or freed to buddy allocator. * - * We check if a current MAX_ORDER block is valid by only checking the validity - * of the head pfn. 
+ * We check if a current MAX_PAGE_ORDER block is valid by only checking the + * validity of the head pfn. */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { @@ -2149,8 +2149,8 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); /* - * Initialize and free pages in MAX_ORDER sized increments so that we - * can avoid introducing any issues with the buddy allocator. + * Initialize and free pages in MAX_PAGE_ORDER sized increments so that + * we can avoid introducing any issues with the buddy allocator. */ while (spfn < end_pfn) { deferred_init_maxorder(&i, zone, &spfn, &epfn); @@ -2291,7 +2291,7 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order) } /* - * Initialize and free pages in MAX_ORDER sized increments so + * Initialize and free pages in MAX_PAGE_ORDER sized increments so * that we can avoid introducing any issues with the buddy * allocator. */ @@ -2509,7 +2509,7 @@ void *__init alloc_large_system_hash(const char *tablename, else table = memblock_alloc_raw(size, SMP_CACHE_BYTES); - } else if (get_order(size) > MAX_ORDER || hashdist) { + } else if (get_order(size) > MAX_PAGE_ORDER || hashdist) { table = vmalloc_huge(size, gfp_flags); virt = true; if (table) @@ -2756,7 +2756,7 @@ void __init mm_core_init(void) /* * page_ext requires contiguous pages, - * bigger than MAX_ORDER unless SPARSEMEM. + * bigger than MAX_PAGE_ORDER unless SPARSEMEM. 
*/ page_ext_init_flatmem(); mem_debugging_and_hardening_init(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ccecf6158ae4..a01baf0454f8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -727,7 +727,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, unsigned long higher_page_pfn; struct page *higher_page; - if (order >= MAX_ORDER - 1) + if (order >= MAX_PAGE_ORDER - 1) return false; higher_page_pfn = buddy_pfn & pfn; @@ -782,7 +782,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER) { + while (order < MAX_PAGE_ORDER) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -1297,7 +1297,7 @@ void __free_pages_core(struct page *page, unsigned int order) atomic_long_add(nr_pages, &page_zone(page)->managed_pages); if (page_contains_unaccepted(page, order)) { - if (order == MAX_ORDER && __free_unaccepted(page)) + if (order == MAX_PAGE_ORDER && __free_unaccepted(page)) return; accept_page(page, order); @@ -1327,7 +1327,7 @@ void __free_pages_core(struct page *page, unsigned int order) * * Note: the function may return non-NULL struct page even for a page block * which contains a memory hole (i.e. there is no physical memory for a subset - * of the pfn range). For example, if the pageblock order is MAX_ORDER, which + * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole * even though the start pfn is online and valid. This should be safe most of * the time because struct pages are still initialized via init_unavailable_range() @@ -2018,7 +2018,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, * approximates finding the pageblock with the most free pages, which * would be too costly to do exactly. 
*/ - for (current_order = MAX_ORDER; current_order >= min_order; + for (current_order = MAX_PAGE_ORDER; current_order >= min_order; --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2056,7 +2056,7 @@ find_smallest: * This should not happen - we already found a suitable fallback * when looking for the largest page. */ - VM_BUG_ON(current_order > MAX_ORDER); + VM_BUG_ON(current_order > MAX_PAGE_ORDER); do_steal: page = get_page_from_free_area(area, fallback_mt); @@ -4533,7 +4533,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, * There are several places where we assume that the order value is sane * so bail out early if the request is out of bound. */ - if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)) + if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp)) return NULL; gfp &= gfp_allowed_mask; @@ -4815,7 +4815,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, * minimum number of pages to satisfy the request. alloc_pages() can only * allocate memory in power-of-two pages. * - * This function is also limited by MAX_ORDER. + * This function is also limited by MAX_PAGE_ORDER. * * Memory allocated by this function must be released by free_pages_exact(). 
* @@ -6373,7 +6373,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, order = 0; outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { - if (++order > MAX_ORDER) { + if (++order > MAX_PAGE_ORDER) { outer_start = start; break; } @@ -6635,7 +6635,7 @@ bool is_free_buddy_page(struct page *page) break; } - return order <= MAX_ORDER; + return order <= MAX_PAGE_ORDER; } EXPORT_SYMBOL(is_free_buddy_page); @@ -6807,9 +6807,9 @@ static bool try_to_accept_memory_one(struct zone *zone) __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); spin_unlock_irqrestore(&zone->lock, flags); - accept_page(page, MAX_ORDER); + accept_page(page, MAX_PAGE_ORDER); - __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL); + __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); if (last) static_branch_dec(&zones_with_unaccepted_pages); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index bcf99ba747a0..cd0ea3668253 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -226,7 +226,7 @@ static void unset_migratetype_isolate(struct page *page, int migratetype) */ if (PageBuddy(page)) { order = buddy_order(page); - if (order >= pageblock_order && order < MAX_ORDER) { + if (order >= pageblock_order && order < MAX_PAGE_ORDER) { buddy = find_buddy_page_pfn(page, page_to_pfn(page), order, NULL); if (buddy && !is_migrate_isolate_page(buddy)) { @@ -290,11 +290,12 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * isolate_single_pageblock() * @migratetype: migrate type to set in error recovery. * - * Free and in-use pages can be as big as MAX_ORDER and contain more than one + * Free and in-use pages can be as big as MAX_PAGE_ORDER and contain more than one * pageblock. When not all pageblocks within a page are isolated at the same * time, free page accounting can go wrong. For example, in the case of - * MAX_ORDER = pageblock_order + 1, a MAX_ORDER page has two pagelbocks. 
- * [ MAX_ORDER ] + * MAX_PAGE_ORDER = pageblock_order + 1, a MAX_PAGE_ORDER page has two + * pagelbocks. + * [ MAX_PAGE_ORDER ] * [ pageblock0 | pageblock1 ] * When either pageblock is isolated, if it is a free page, the page is not * split into separate migratetype lists, which is supposed to; if it is an @@ -451,7 +452,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, * the free page to the right migratetype list. * * head_pfn is not used here as a hugetlb page order - * can be bigger than MAX_ORDER, but after it is + * can be bigger than MAX_PAGE_ORDER, but after it is * freed, the free page order is not. Use pfn within * the range to find the head of the free page. */ @@ -459,7 +460,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, outer_pfn = pfn; while (!PageBuddy(pfn_to_page(outer_pfn))) { /* stop if we cannot find the free page */ - if (++order > MAX_ORDER) + if (++order > MAX_PAGE_ORDER) goto failed; outer_pfn &= ~0UL << order; } @@ -660,8 +661,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, int ret; /* - * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages - * are not aligned to pageblock_nr_pages. + * Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free + * pages are not aligned to pageblock_nr_pages. * Then we just check migratetype first. 
*/ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 040dbf26a986..5634e5d890f8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -320,7 +320,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, unsigned long freepage_order; freepage_order = buddy_order_unsafe(page); - if (freepage_order <= MAX_ORDER) + if (freepage_order <= MAX_PAGE_ORDER) pfn += (1UL << freepage_order) - 1; continue; } @@ -555,7 +555,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (PageBuddy(page)) { unsigned long freepage_order = buddy_order_unsafe(page); - if (freepage_order <= MAX_ORDER) + if (freepage_order <= MAX_PAGE_ORDER) pfn += (1UL << freepage_order) - 1; continue; } @@ -663,7 +663,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (PageBuddy(page)) { unsigned long order = buddy_order_unsafe(page); - if (order > 0 && order <= MAX_ORDER) + if (order > 0 && order <= MAX_PAGE_ORDER) pfn += (1UL << order) - 1; continue; } diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 66369cc5279b..e4c428e61d8c 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -20,7 +20,7 @@ static int page_order_update_notify(const char *val, const struct kernel_param * * If param is set beyond this limit, order is set to default * pageblock_order value */ - return param_set_uint_minmax(val, kp, 0, MAX_ORDER); + return param_set_uint_minmax(val, kp, 0, MAX_PAGE_ORDER); } static const struct kernel_param_ops page_reporting_param_ops = { @@ -370,7 +370,7 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) */ if (page_reporting_order == -1) { - if (prdev->order > 0 && prdev->order <= MAX_ORDER) + if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER) page_reporting_order = prdev->order; else page_reporting_order = pageblock_order; diff --git a/mm/shuffle.h b/mm/shuffle.h index a6bdf54f96f1..61bbcddeeee6 100644 --- a/mm/shuffle.h +++ 
b/mm/shuffle.h @@ -4,7 +4,7 @@ #define _MM_SHUFFLE_H #include -#define SHUFFLE_ORDER MAX_ORDER +#define SHUFFLE_ORDER MAX_PAGE_ORDER #ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); diff --git a/mm/slab.c b/mm/slab.c index 773c79e153f3..073cae923d56 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -465,7 +465,7 @@ static int __init slab_max_order_setup(char *str) { get_option(&str, &slab_max_order); slab_max_order = slab_max_order < 0 ? 0 : - min(slab_max_order, MAX_ORDER); + min(slab_max_order, MAX_PAGE_ORDER); slab_max_order_set = true; return 1; diff --git a/mm/slub.c b/mm/slub.c index a5420be89c8c..ba162e661e2e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4194,7 +4194,7 @@ static inline int calculate_order(unsigned int size) * Doh this slab cannot be placed using slub_max_order. */ order = get_order(size); - if (order <= MAX_ORDER) + if (order <= MAX_PAGE_ORDER) return order; return -ENOSYS; } @@ -4722,7 +4722,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, (int *)&slub_max_order); - slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER); + slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER); if (slub_min_order > slub_max_order) slub_min_order = slub_max_order; diff --git a/mm/vmscan.c b/mm/vmscan.c index 600ed3cbf7cb..68f0abbb8e59 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6415,7 +6415,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. 
*/ - BUILD_BUG_ON(MAX_ORDER >= S8_MAX); + BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX); BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); diff --git a/mm/vmstat.c b/mm/vmstat.c index 03ead31c46a0..db79935e4a54 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1092,7 +1092,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in { unsigned long requested = 1UL << order; - if (WARN_ON_ONCE(order > MAX_ORDER)) + if (WARN_ON_ONCE(order > MAX_PAGE_ORDER)) return 0; if (!info->free_blocks_total) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 89981dbe46c9..97704a9e84c7 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -844,7 +844,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) goto out; /* the calculated number of cq entries fits to mlx5 cq allocation */ cqe_size_order = cache_line_size() == 128 ? 7 : 6; - smc_order = MAX_ORDER - cqe_size_order; + smc_order = MAX_PAGE_ORDER - cqe_size_order; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 51ad29940f05..f3738b2c8bcd 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -38,7 +38,7 @@ static int param_set_bufsize(const char *val, const struct kernel_param *kp) size = memparse(val, NULL); order = get_order(size); - if (order > MAX_ORDER) + if (order > MAX_PAGE_ORDER) return -EINVAL; ima_maxorder = order; ima_bufsize = PAGE_SIZE << order; diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt index 4c90cc176f81..2109690b0d5f 100644 --- a/tools/perf/Documentation/perf-intel-pt.txt +++ b/tools/perf/Documentation/perf-intel-pt.txt @@ -683,7 +683,7 @@ Buffer handling ~~~~~~~~~~~~~~~ There may be buffer limitations (i.e. 
single ToPa entry) which means that actual -buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER). In order to +buffer sizes are limited to powers of 2 up to 4MiB (MAX_PAGE_ORDER). In order to provide other sizes, and in particular an arbitrarily large size, multiple buffers are logically concatenated. However an interrupt must be used to switch between buffers. That has two potential problems: diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h index 134f8eab0768..71546e15bdd3 100644 --- a/tools/testing/memblock/linux/mmzone.h +++ b/tools/testing/memblock/linux/mmzone.h @@ -17,10 +17,10 @@ enum zone_type { }; #define MAX_NR_ZONES __MAX_NR_ZONES -#define MAX_ORDER 10 -#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) +#define MAX_PAGE_ORDER 10 +#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) -#define pageblock_order MAX_ORDER +#define pageblock_order MAX_PAGE_ORDER #define pageblock_nr_pages BIT(pageblock_order) #define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 16ed4dfa7359..622987f12c89 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -3,7 +3,8 @@ Before running this huge pages for each huge page size must have been reserved. - For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used. + For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must + be used. Also shmmax must be increased. And you need to run as root to work around some weird permissions in shm. And nothing using huge pages should run in parallel. -- cgit v1.2.3