From b2472efe4316b2687c153919c1513a098bd82c17 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:37 +0800 Subject: maple_tree: introduce {mtree,mas}_lock_nested() In some cases, nested locks may be needed, so {mtree,mas}_lock_nested is introduced. For example, when duplicating maple tree, we need to hold the locks of two trees, in which case nested locks are needed. At the same time, add the definition of spin_lock_nested() in tools for testing. Link: https://lkml.kernel.org/r/20231027033845.90608-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/include/linux/spinlock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools') diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 622266b197d0..a6cdf25b6b9d 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -11,6 +11,7 @@ #define spin_lock_init(x) pthread_mutex_init(x, NULL) #define spin_lock(x) pthread_mutex_lock(x) +#define spin_lock_nested(x, subclass) pthread_mutex_lock(x) #define spin_unlock(x) pthread_mutex_unlock(x) #define spin_lock_bh(x) pthread_mutex_lock(x) #define spin_unlock_bh(x) pthread_mutex_unlock(x) -- cgit v1.2.3 From 46c99e26f2f86260fed226cab217d0b3ca8dca56 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:39 +0800 Subject: radix tree test suite: align kmem_cache_alloc_bulk() with kernel behavior. When kmem_cache_alloc_bulk() fails to allocate, leave the freed pointers in the array. This enables a more accurate simulation of the kernel's behavior and allows for testing potential double-free scenarios. 
Link: https://lkml.kernel.org/r/20231027033845.90608-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/radix-tree/linux.c | 45 +++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index 61fe2601cb3a..4eb442206d01 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -93,13 +93,9 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, return p; } -void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +void __kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) { assert(objp); - uatomic_dec(&nr_allocated); - uatomic_dec(&cachep->nr_allocated); - if (kmalloc_verbose) - printf("Freeing %p to slab\n", objp); if (cachep->nr_objs > 10 || cachep->align) { memset(objp, POISON_FREE, cachep->size); free(objp); @@ -111,6 +107,15 @@ void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) } } +void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +{ + uatomic_dec(&nr_allocated); + uatomic_dec(&cachep->nr_allocated); + if (kmalloc_verbose) + printf("Freeing %p to slab\n", objp); + __kmem_cache_free_locked(cachep, objp); +} + void kmem_cache_free(struct kmem_cache *cachep, void *objp) { pthread_mutex_lock(&cachep->lock); @@ -141,18 +146,17 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, if (kmalloc_verbose) pr_debug("Bulk alloc %lu\n", size); - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (cachep->non_kernel < size) - return 0; - - cachep->non_kernel -= size; - } - pthread_mutex_lock(&cachep->lock); if (cachep->nr_objs >= size) { 
struct radix_tree_node *node; for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + node = cachep->objs; cachep->nr_objs--; cachep->objs = node->parent; @@ -163,11 +167,19 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } else { pthread_mutex_unlock(&cachep->lock); for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + if (cachep->align) { posix_memalign(&p[i], cachep->align, cachep->size); } else { p[i] = malloc(cachep->size); + if (!p[i]) + break; } if (cachep->ctor) cachep->ctor(p[i]); @@ -176,6 +188,15 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } } + if (i < size) { + size = i; + pthread_mutex_lock(&cachep->lock); + for (i = 0; i < size; i++) + __kmem_cache_free_locked(cachep, p[i]); + pthread_mutex_unlock(&cachep->lock); + return 0; + } + for (i = 0; i < size; i++) { uatomic_inc(&nr_allocated); uatomic_inc(&cachep->nr_allocated); -- cgit v1.2.3 From a2587a7e8d37885dc063255f5400a66299b42e48 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:40 +0800 Subject: maple_tree: add test for mtree_dup() Add test for mtree_dup(). Test by duplicating different maple trees and then comparing the two trees. Includes tests for duplicating full trees and memory allocation failures on different nodes. Link: https://lkml.kernel.org/r/20231027033845.90608-6-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 361 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 361 insertions(+) (limited to 'tools') diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index e5da1cad70ba..12b3390e9591 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35857,6 +35857,363 @@ static noinline void __init check_locky(struct maple_tree *mt) mt_clear_in_rcu(mt); } +/* + * Compares two nodes except for the addresses stored in the nodes. + * Returns zero if they are the same, otherwise returns non-zero. + */ +static int __init compare_node(struct maple_enode *enode_a, + struct maple_enode *enode_b) +{ + struct maple_node *node_a, *node_b; + struct maple_node a, b; + void **slots_a, **slots_b; /* Do not use the rcu tag. */ + enum maple_type type; + int i; + + if (((unsigned long)enode_a & MAPLE_NODE_MASK) != + ((unsigned long)enode_b & MAPLE_NODE_MASK)) { + pr_err("The lower 8 bits of enode are different.\n"); + return -1; + } + + type = mte_node_type(enode_a); + node_a = mte_to_node(enode_a); + node_b = mte_to_node(enode_b); + a = *node_a; + b = *node_b; + + /* Do not compare addresses. */ + if (ma_is_root(node_a) || ma_is_root(node_b)) { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MA_ROOT_PARENT); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MA_ROOT_PARENT); + } else { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MAPLE_NODE_MASK); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MAPLE_NODE_MASK); + } + + if (a.parent != b.parent) { + pr_err("The lower 8 bits of parents are different. %p %p\n", + a.parent, b.parent); + return -1; + } + + /* + * If it is a leaf node, the slots do not contain the node address, and + * no special processing of slots is required. 
+ */ + if (ma_is_leaf(type)) + goto cmp; + + slots_a = ma_slots(&a, type); + slots_b = ma_slots(&b, type); + + for (i = 0; i < mt_slots[type]; i++) { + if (!slots_a[i] && !slots_b[i]) + break; + + if (!slots_a[i] || !slots_b[i]) { + pr_err("The number of slots is different.\n"); + return -1; + } + + /* Do not compare addresses in slots. */ + ((unsigned long *)slots_a)[i] &= MAPLE_NODE_MASK; + ((unsigned long *)slots_b)[i] &= MAPLE_NODE_MASK; + } + +cmp: + /* + * Compare all contents of two nodes, including parent (except address), + * slots (except address), pivots, gaps and metadata. + */ + return memcmp(&a, &b, sizeof(struct maple_node)); +} + +/* + * Compare two trees and return 0 if they are the same, non-zero otherwise. + */ +static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) +{ + MA_STATE(mas_a, mt_a, 0, 0); + MA_STATE(mas_b, mt_b, 0, 0); + + if (mt_a->ma_flags != mt_b->ma_flags) { + pr_err("The flags of the two trees are different.\n"); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + + if (mas_is_ptr(&mas_a) || mas_is_ptr(&mas_b)) { + if (!(mas_is_ptr(&mas_a) && mas_is_ptr(&mas_b))) { + pr_err("One is MAS_ROOT and the other is not.\n"); + return -1; + } + return 0; + } + + while (!mas_is_none(&mas_a) || !mas_is_none(&mas_b)) { + + if (mas_is_none(&mas_a) || mas_is_none(&mas_b)) { + pr_err("One is MAS_NONE and the other is not.\n"); + return -1; + } + + if (mas_a.min != mas_b.min || + mas_a.max != mas_b.max) { + pr_err("mas->min, mas->max do not match.\n"); + return -1; + } + + if (compare_node(mas_a.node, mas_b.node)) { + pr_err("The contents of nodes %p and %p are different.\n", + mas_a.node, mas_b.node); + mt_dump(mt_a, mt_dump_dec); + mt_dump(mt_b, mt_dump_dec); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + } + + return 0; +} + +static __init void mas_subtree_max_range(struct ma_state *mas) +{ + unsigned long limit = mas->max; + MA_STATE(newmas, mas->tree, 0, 0); + 
void *entry; + + mas_for_each(mas, entry, limit) { + if (mas->last - mas->index >= + newmas.last - newmas.index) { + newmas = *mas; + } + } + + *mas = newmas; +} + +/* + * build_full_tree() - Build a full tree. + * @mt: The tree to build. + * @flags: Use @flags to build the tree. + * @height: The height of the tree to build. + * + * Build a tree with full leaf nodes and internal nodes. Note that the height + * should not exceed 3, otherwise it will take a long time to build. + * Return: zero if the build is successful, non-zero if it fails. + */ +static __init int build_full_tree(struct maple_tree *mt, unsigned int flags, + int height) +{ + MA_STATE(mas, mt, 0, 0); + unsigned long step; + int ret = 0, cnt = 1; + enum maple_type type; + + mt_init_flags(mt, flags); + mtree_insert_range(mt, 0, ULONG_MAX, xa_mk_value(5), GFP_KERNEL); + + mtree_lock(mt); + + while (1) { + mas_set(&mas, 0); + if (mt_height(mt) < height) { + mas.max = ULONG_MAX; + goto store; + } + + while (1) { + mas_dfs_preorder(&mas); + if (mas_is_none(&mas)) + goto unlock; + + type = mte_node_type(mas.node); + if (mas_data_end(&mas) + 1 < mt_slots[type]) { + mas_set(&mas, mas.min); + goto store; + } + } +store: + mas_subtree_max_range(&mas); + step = mas.last - mas.index; + if (step < 1) { + ret = -1; + goto unlock; + } + + step /= 2; + mas.last = mas.index + step; + mas_store_gfp(&mas, xa_mk_value(5), + GFP_KERNEL); + ++cnt; + } +unlock: + mtree_unlock(mt); + + MT_BUG_ON(mt, mt_height(mt) != height); + /* pr_info("height:%u number of elements:%d\n", mt_height(mt), cnt); */ + return ret; +} + +static noinline void __init check_mtree_dup(struct maple_tree *mt) +{ + DEFINE_MTREE(new); + int i, j, ret, count = 0; + unsigned int rand_seed = 17, rand; + + /* store a value at [0, 0] */ + mt_init_flags(mt, 0); + mtree_store_range(mt, 0, 0, xa_mk_value(0), GFP_KERNEL); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + 
mtree_destroy(mt); + mtree_destroy(&new); + + /* The two trees have different attributes. */ + mt_init_flags(mt, 0); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret != -EINVAL); + mtree_destroy(mt); + mtree_destroy(&new); + + /* The new tree is not empty */ + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + mtree_store(&new, 5, xa_mk_value(5), GFP_KERNEL); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret != -EINVAL); + mtree_destroy(mt); + mtree_destroy(&new); + + /* Test for duplicating full trees. */ + for (i = 1; i <= 3; i++) { + ret = build_full_tree(mt, 0, i); + MT_BUG_ON(mt, ret); + mt_init_flags(&new, 0); + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + for (i = 1; i <= 3; i++) { + ret = build_full_tree(mt, MT_FLAGS_ALLOC_RANGE, i); + MT_BUG_ON(mt, ret); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* Test for normal duplicating. */ + for (i = 0; i < 1000; i += 3) { + if (i & 1) { + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + } else { + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + } + + for (j = 0; j < i; j++) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* Test memory allocation failed. */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i < 30; i += 3) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + + /* Failed at the first node. 
*/ + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + mt_set_non_kernel(0); + ret = mtree_dup(mt, &new, GFP_NOWAIT); + mt_set_non_kernel(0); + MT_BUG_ON(&new, ret != -ENOMEM); + mtree_destroy(mt); + mtree_destroy(&new); + + /* Random maple tree fails at a random node. */ + for (i = 0; i < 1000; i += 3) { + if (i & 1) { + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + } else { + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + } + + for (j = 0; j < i; j++) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + /* + * The rand() library function is not used, so we can generate + * the same random numbers on any platform. + */ + rand_seed = rand_seed * 1103515245 + 12345; + rand = rand_seed / 65536 % 128; + mt_set_non_kernel(rand); + + ret = mtree_dup(mt, &new, GFP_NOWAIT); + mt_set_non_kernel(0); + if (ret != 0) { + MT_BUG_ON(&new, ret != -ENOMEM); + count++; + mtree_destroy(mt); + continue; + } + + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* pr_info("mtree_dup() fail %d times\n", count); */ + BUG_ON(!count); +} + extern void test_kmem_cache_bulk(void); void farmer_tests(void) @@ -35904,6 +36261,10 @@ void farmer_tests(void) check_null_expand(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, 0); + check_mtree_dup(&tree); + mtree_destroy(&tree); + /* RCU testing */ mt_init_flags(&tree, 0); check_erase_testset(&tree); -- cgit v1.2.3 From f670fa1caadb4ea532a89012c5451e4c6789bfcc Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:42 +0800 Subject: maple_tree: skip other tests when BENCH is enabled Skip other tests when BENCH is enabled so that performance can be measured in user space. Link: https://lkml.kernel.org/r/20231027033845.90608-8-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. 
Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 8 ++++---- tools/testing/radix-tree/maple.c | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'tools') diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 464eeb90d5ad..de470950714f 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -3585,10 +3585,6 @@ static int __init maple_tree_seed(void) pr_info("\nTEST STARTING\n\n"); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_root_expand(&tree); - mtree_destroy(&tree); - #if defined(BENCH_SLOT_STORE) #define BENCH mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); @@ -3646,6 +3642,10 @@ static int __init maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_root_expand(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); mtree_destroy(&tree); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 12b3390e9591..cb5358674521 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36299,7 +36299,9 @@ void farmer_tests(void) void maple_tree_tests(void) { +#if !defined(BENCH) farmer_tests(); +#endif maple_tree_seed(); maple_tree_harvest(); } -- cgit v1.2.3 From 446e1867e6df3cbdd19af6be8f8f4ed56176adb4 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:43 +0800 Subject: maple_tree: update check_forking() and bench_forking() Updated check_forking() and bench_forking() to use __mt_dup() to duplicate maple tree. Link: https://lkml.kernel.org/r/20231027033845.90608-9-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. 
Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 117 ++++++++++++++++++++++---------------------- tools/include/linux/rwsem.h | 4 ++ 2 files changed, 62 insertions(+), 59 deletions(-) (limited to 'tools') diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index de470950714f..3e4597fb49d3 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1834,47 +1834,48 @@ static noinline void __init bench_mas_prev(struct maple_tree *mt) } #endif /* check_forking - simulate the kernel forking sequence with the tree. */ -static noinline void __init check_forking(struct maple_tree *mt) +static noinline void __init check_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134; + struct maple_tree mt, newmt; + int i, nr_entries = 134, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); - mt_set_non_kernel(99999); mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&newmt, &newmt_lock); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - down_write(&newmt_lock); - mas.index = 0; - mas.last = 0; - if (mas_expected_entries(&newmas, nr_entries)) { + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = 
__mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { pr_err("OOM!"); BUG_ON(1); } - rcu_read_lock(); - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } - rcu_read_unlock(); + mas_destroy(&newmas); + mas_destroy(&mas); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); + __mt_destroy(&mt); up_write(&newmt_lock); + up_write(&mt_lock); } static noinline void __init check_iteration(struct maple_tree *mt) @@ -1977,49 +1978,51 @@ static noinline void __init check_mas_store_gfp(struct maple_tree *mt) } #if defined(BENCH_FORK) -static noinline void __init bench_forking(struct maple_tree *mt) +static noinline void __init bench_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134, nr_fork = 80000; + struct maple_tree mt, newmt; + int i, nr_entries = 134, nr_fork = 80000, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - mt_set_external_lock(&newmt, &newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } for (i = 0; i < nr_fork; i++) { - mt_set_non_kernel(99999); - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - mas.index = 0; - mas.last = 0; - rcu_read_lock(); - down_write(&newmt_lock); - if (mas_expected_entries(&newmas, nr_entries)) { - printk("OOM!"); + mt_init_flags(&newmt, + MT_FLAGS_ALLOC_RANGE | 
MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&newmt, &newmt_lock); + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { + pr_err("OOM!"); BUG_ON(1); } - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } + mas_destroy(&newmas); - rcu_read_unlock(); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); up_write(&newmt_lock); } + mas_destroy(&mas); + __mt_destroy(&mt); + up_write(&mt_lock); } #endif @@ -3615,9 +3618,7 @@ static int __init maple_tree_seed(void) #endif #if defined(BENCH_FORK) #define BENCH - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - bench_forking(&tree); - mtree_destroy(&tree); + bench_forking(); goto skip; #endif #if defined(BENCH_MT_FOR_EACH) @@ -3650,9 +3651,7 @@ static int __init maple_tree_seed(void) check_iteration(&tree); mtree_destroy(&tree); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_forking(&tree); - mtree_destroy(&tree); + check_forking(); mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_mas_store_gfp(&tree); diff --git a/tools/include/linux/rwsem.h b/tools/include/linux/rwsem.h index 83971b3cbfce..f8bffd4a987c 100644 --- a/tools/include/linux/rwsem.h +++ b/tools/include/linux/rwsem.h @@ -37,4 +37,8 @@ static inline int up_write(struct rw_semaphore *sem) { return pthread_rwlock_unlock(&sem->lock); } + +#define down_read_nested(sem, subclass) down_read(sem) +#define down_write_nested(sem, subclass) down_write(sem) + #endif /* _TOOLS_RWSEM_H */ -- cgit v1.2.3 From e6a9a2cbc13bf43e4c03f57666e93d511249d5d7 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 6 Nov 2023 14:09:58 -0800 Subject: fs/proc/task_mmu: report SOFT_DIRTY bits through the PAGEMAP_SCAN ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PAGEMAP_SCAN ioctl returns information regarding page table entries. 
It is more efficient compared to reading pagemap files. CRIU can start to utilize this ioctl, but it needs info about soft-dirty bits to track memory changes. We are aware of a new method for tracking memory changes implemented in the PAGEMAP_SCAN ioctl. For CRIU, the primary advantage of this method is its usability by unprivileged users. However, it is not feasible to transparently replace the soft-dirty tracker with the new one. The main problem here is userfault descriptors that have to be preserved between pre-dump iterations. It means criu continues supporting the soft-dirty method to avoid breakage for current users. The new method will be implemented as a separate feature. [avagin@google.com: update tools/include/uapi/linux/fs.h] Link: https://lkml.kernel.org/r/20231107164139.576046-1-avagin@google.com Link: https://lkml.kernel.org/r/20231106220959.296568-1-avagin@google.com Signed-off-by: Andrei Vagin Reviewed-by: Muhammad Usama Anjum Cc: Michał Mirosław Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/pagemap.rst | 1 + fs/proc/task_mmu.c | 17 ++++++++++++++++- include/uapi/linux/fs.h | 1 + tools/include/uapi/linux/fs.h | 1 + 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'tools') diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index fe17cf210426..f5f065c67615 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -253,6 +253,7 @@ Following flags about pages are currently supported: - ``PAGE_IS_SWAPPED`` - Page is in swapped - ``PAGE_IS_PFNZERO`` - Page has zero PFN - ``PAGE_IS_HUGE`` - Page is THP or Hugetlb backed +- ``PAGE_IS_SOFT_DIRTY`` - Page is soft-dirty The ``struct pm_scan_arg`` is used as the argument of the IOCTL. 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 435b61054b5b..d19924bf0a39 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1761,7 +1761,7 @@ static int pagemap_release(struct inode *inode, struct file *file) #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ - PAGE_IS_HUGE) + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { @@ -1793,6 +1793,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t swp; @@ -1806,6 +1808,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, !PageAnon(pfn_swap_entry_to_page(swp))) categories |= PAGE_IS_FILE; } + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } return categories; @@ -1853,12 +1857,16 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, if (is_zero_pfn(pmd_pfn(pmd))) categories |= PAGE_IS_PFNZERO; + if (pmd_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pmd(pmd)) { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; + if (pmd_swp_soft_dirty(pmd)) + categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); @@ -1905,10 +1913,14 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) categories |= PAGE_IS_FILE; if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; + if (pte_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; + if (pte_swp_soft_dirty(pte)) + categories |= PAGE_IS_SOFT_DIRTY; } return categories; @@ 
-2007,6 +2019,9 @@ static int pagemap_scan_test_walk(unsigned long start, unsigned long end, if (wp_allowed) vma_category |= PAGE_IS_WPALLOWED; + if (vma->vm_flags & VM_SOFTDIRTY) + vma_category |= PAGE_IS_SOFT_DIRTY; + if (!pagemap_scan_is_interesting_vma(vma_category, p)) return 1; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index da43810b7485..48ad69f7722e 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -316,6 +316,7 @@ typedef int __bitwise __kernel_rwf_t; #define PAGE_IS_SWAPPED (1 << 4) #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) /* * struct page_region - Page region with flags diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index da43810b7485..48ad69f7722e 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -316,6 +316,7 @@ typedef int __bitwise __kernel_rwf_t; #define PAGE_IS_SWAPPED (1 << 4) #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) /* * struct page_region - Page region with flags -- cgit v1.2.3 From 600bca580579d8d8454cc8fe3290e2f8b9c01884 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 6 Nov 2023 14:09:59 -0800 Subject: selftests/mm: check that PAGEMAP_SCAN returns correct categories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Right now, tests read page flags from /proc/pid/pagemap files. With this change, tests will check that PAGEMAP_SCAN return correct information too. 
[colin.i.king@gmail.com: fix spelling mistake "succedded" -> "succeeded"] Link: https://lkml.kernel.org/r/20231121093104.1728332-1-colin.i.king@gmail.com Link: https://lkml.kernel.org/r/20231106220959.296568-2-avagin@google.com Signed-off-by: Andrei Vagin Signed-off-by: Colin Ian King Reviewed-by: Muhammad Usama Anjum Tested-by: Muhammad Usama Anjum Cc: Michał Mirosław [avagin@google.com: allow running tests on old kernels] Link: https://lkml.kernel.org/r/20231117181127.2574897-1-avagin@google.com Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/vm_util.c | 80 ++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 3082b40492dd..05736c615734 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include "../kselftest.h" @@ -28,19 +29,92 @@ uint64_t pagemap_get_entry(int fd, char *start) return entry; } +static uint64_t __pagemap_scan_get_categories(int fd, char *start, struct page_region *r) +{ + struct pm_scan_arg arg; + + arg.start = (uintptr_t)start; + arg.end = (uintptr_t)(start + psize()); + arg.vec = (uintptr_t)r; + arg.vec_len = 1; + arg.flags = 0; + arg.size = sizeof(struct pm_scan_arg); + arg.max_pages = 0; + arg.category_inverted = 0; + arg.category_mask = 0; + arg.category_anyof_mask = PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | PAGE_IS_FILE | + PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY; + arg.return_mask = arg.category_anyof_mask; + + return ioctl(fd, PAGEMAP_SCAN, &arg); +} + +static uint64_t pagemap_scan_get_categories(int fd, char *start) +{ + struct page_region r; + long ret; + + ret = __pagemap_scan_get_categories(fd, start, &r); + if (ret < 0) + ksft_exit_fail_msg("PAGEMAP_SCAN failed: %s\n", strerror(errno)); + if (ret == 0) + return 0; + 
return r.categories; +} + +/* `start` is any valid address. */ +static bool pagemap_scan_supported(int fd, char *start) +{ + static int supported = -1; + int ret; + + if (supported != -1) + return supported; + + /* Provide an invalid address in order to trigger EFAULT. */ + ret = __pagemap_scan_get_categories(fd, start, (struct page_region *) ~0UL); + if (ret == 0) + ksft_exit_fail_msg("PAGEMAP_SCAN succeeded unexpectedly\n"); + + supported = errno == EFAULT; + + return supported; +} + +static bool page_entry_is(int fd, char *start, char *desc, + uint64_t pagemap_flags, uint64_t pagescan_flags) +{ + bool m = pagemap_get_entry(fd, start) & pagemap_flags; + + if (pagemap_scan_supported(fd, start)) { + bool s = pagemap_scan_get_categories(fd, start) & pagescan_flags; + + if (m == s) + return m; + + ksft_exit_fail_msg( + "read and ioctl return unmatched results for %s: %d %d", desc, m, s); + } + return m; +} + bool pagemap_is_softdirty(int fd, char *start) { - return pagemap_get_entry(fd, start) & PM_SOFT_DIRTY; + return page_entry_is(fd, start, "soft-dirty", + PM_SOFT_DIRTY, PAGE_IS_SOFT_DIRTY); } bool pagemap_is_swapped(int fd, char *start) { - return pagemap_get_entry(fd, start) & PM_SWAP; + return page_entry_is(fd, start, "swap", PM_SWAP, PAGE_IS_SWAPPED); } bool pagemap_is_populated(int fd, char *start) { - return pagemap_get_entry(fd, start) & (PM_PRESENT | PM_SWAP); + return page_entry_is(fd, start, "populated", + PM_PRESENT | PM_SWAP, + PAGE_IS_PRESENT | PAGE_IS_SWAPPED); } unsigned long pagemap_get_pfn(int fd, char *start) -- cgit v1.2.3 From 60433a9d038db006ca2f49e3c5f050dc46aaad3a Mon Sep 17 00:00:00 2001 From: Dmitry Rokosov Date: Thu, 23 Nov 2023 10:19:43 +0300 Subject: samples: introduce new samples subdir for cgroup Patch series "samples: introduce cgroup events listeners", v3. To begin with, this patch series relocates the cgroup example code to the samples/cgroup directory, which is the appropriate location for such code snippets. 
Furthermore, a new memcg events listener is introduced. This listener is a simple yet effective tool for monitoring memory events and managing counter changes during runtime. Additionally, as per Andrew Morton's suggestion, a helpful reminder comment is included in the memcontrol implementation. This comment serves to ensure that the samples code is updated whenever new events are added. This patch (of 3): Move the cgroup_event_listener for cgroup v1 to the samples directory. This suggestion was proposed by Andrew Morton during the discussion [1]. Link: https://lore.kernel.org/all/20231106140934.3f5d4960141562fe8da53906@linux-foundation.org/ [1] Link: https://lkml.kernel.org/r/20231123071945.25811-1-ddrokosov@salutedevices.com Link: https://lkml.kernel.org/r/20231123071945.25811-2-ddrokosov@salutedevices.com Signed-off-by: Dmitry Rokosov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + samples/Kconfig | 6 +++ samples/Makefile | 1 + samples/cgroup/Makefile | 5 ++ samples/cgroup/cgroup_event_listener.c | 83 ++++++++++++++++++++++++++++++++++ tools/cgroup/Makefile | 11 ----- tools/cgroup/cgroup_event_listener.c | 83 ---------------------------------- 7 files changed, 96 insertions(+), 94 deletions(-) create mode 100644 samples/cgroup/Makefile create mode 100644 samples/cgroup/cgroup_event_listener.c delete mode 100644 tools/cgroup/Makefile delete mode 100644 tools/cgroup/cgroup_event_listener.c (limited to 'tools') diff --git a/MAINTAINERS b/MAINTAINERS index 5c9d3d854671..6f5d6962d26d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5339,6 +5339,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/memcontrol.c F: mm/swap_cgroup.c +F: samples/cgroup/* F: tools/testing/selftests/cgroup/memcg_protection.m F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c F: tools/testing/selftests/cgroup/test_kmem.c diff --git a/samples/Kconfig b/samples/Kconfig index b0ddf5f36738..b288d9991d27 
100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -285,6 +285,12 @@ config SAMPLE_KMEMLEAK Build a sample program which have explicitly leaks memory to test kmemleak +config SAMPLE_CGROUP + bool "Build cgroup sample code" + depends on CGROUPS && CC_CAN_LINK && HEADERS_INSTALL + help + Build samples that demonstrate the usage of the cgroup API. + source "samples/rust/Kconfig" endif # SAMPLES diff --git a/samples/Makefile b/samples/Makefile index 0a551c2b33f4..b85fa64390c5 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -3,6 +3,7 @@ subdir-$(CONFIG_SAMPLE_AUXDISPLAY) += auxdisplay subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs +subdir-$(CONFIG_SAMPLE_CGROUP) += cgroup obj-$(CONFIG_SAMPLE_CONFIGFS) += configfs/ obj-$(CONFIG_SAMPLE_CONNECTOR) += connector/ obj-$(CONFIG_SAMPLE_FANOTIFY_ERROR) += fanotify/ diff --git a/samples/cgroup/Makefile b/samples/cgroup/Makefile new file mode 100644 index 000000000000..deef4530f5e7 --- /dev/null +++ b/samples/cgroup/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +userprogs-always-y += cgroup_event_listener + +userccflags += -I usr/include diff --git a/samples/cgroup/cgroup_event_listener.c b/samples/cgroup/cgroup_event_listener.c new file mode 100644 index 000000000000..3d70dc831a76 --- /dev/null +++ b/samples/cgroup/cgroup_event_listener.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * cgroup_event_listener.c - Simple listener of cgroup events + * + * Copyright (C) Kirill A. 
Shutemov + */ + +#include <assert.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <libgen.h> +#include <limits.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include <sys/eventfd.h> + +#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>" + +int main(int argc, char **argv) +{ + int efd = -1; + int cfd = -1; + int event_control = -1; + char event_control_path[PATH_MAX]; + char line[LINE_MAX]; + int ret; + + if (argc != 3) + errx(1, "%s", USAGE_STR); + + cfd = open(argv[1], O_RDONLY); + if (cfd == -1) + err(1, "Cannot open %s", argv[1]); + + ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control", + dirname(argv[1])); + if (ret >= PATH_MAX) + errx(1, "Path to cgroup.event_control is too long"); + + event_control = open(event_control_path, O_WRONLY); + if (event_control == -1) + err(1, "Cannot open %s", event_control_path); + + efd = eventfd(0, 0); + if (efd == -1) + err(1, "eventfd() failed"); + + ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]); + if (ret >= LINE_MAX) + errx(1, "Arguments string is too long"); + + ret = write(event_control, line, strlen(line) + 1); + if (ret == -1) + err(1, "Cannot write to cgroup.event_control"); + + while (1) { + uint64_t result; + + ret = read(efd, &result, sizeof(result)); + if (ret == -1) { + if (errno == EINTR) + continue; + err(1, "Cannot read from eventfd"); + } + assert(ret == sizeof(result)); + + ret = access(event_control_path, W_OK); + if ((ret == -1) && (errno == ENOENT)) { + puts("The cgroup seems to have removed."); + break; + } + + if (ret == -1) + err(1, "cgroup.event_control is not accessible any more"); + + printf("%s %s: crossed\n", argv[1], argv[2]); + } + + return 0; +} diff --git a/tools/cgroup/Makefile b/tools/cgroup/Makefile deleted file mode 100644 index ffca068e4a76..000000000000 --- a/tools/cgroup/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for cgroup tools - -CFLAGS = -Wall -Wextra - -all: cgroup_event_listener -%: %.c - $(CC) $(CFLAGS) -o $@ $^ - -clean: - $(RM) cgroup_event_listener diff --git
a/tools/cgroup/cgroup_event_listener.c b/tools/cgroup/cgroup_event_listener.c deleted file mode 100644 index 3d70dc831a76..000000000000 --- a/tools/cgroup/cgroup_event_listener.c +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * cgroup_event_listener.c - Simple listener of cgroup events - * - * Copyright (C) Kirill A. Shutemov - */ - -#include <assert.h> -#include <err.h> -#include <errno.h> -#include <fcntl.h> -#include <libgen.h> -#include <limits.h> -#include <stdio.h> -#include <string.h> -#include <unistd.h> - -#include <sys/eventfd.h> - -#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>" - -int main(int argc, char **argv) -{ - int efd = -1; - int cfd = -1; - int event_control = -1; - char event_control_path[PATH_MAX]; - char line[LINE_MAX]; - int ret; - - if (argc != 3) - errx(1, "%s", USAGE_STR); - - cfd = open(argv[1], O_RDONLY); - if (cfd == -1) - err(1, "Cannot open %s", argv[1]); - - ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control", - dirname(argv[1])); - if (ret >= PATH_MAX) - errx(1, "Path to cgroup.event_control is too long"); - - event_control = open(event_control_path, O_WRONLY); - if (event_control == -1) - err(1, "Cannot open %s", event_control_path); - - efd = eventfd(0, 0); - if (efd == -1) - err(1, "eventfd() failed"); - - ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]); - if (ret >= LINE_MAX) - errx(1, "Arguments string is too long"); - - ret = write(event_control, line, strlen(line) + 1); - if (ret == -1) - err(1, "Cannot write to cgroup.event_control"); - - while (1) { - uint64_t result; - - ret = read(efd, &result, sizeof(result)); - if (ret == -1) { - if (errno == EINTR) - continue; - err(1, "Cannot read from eventfd"); - } - assert(ret == sizeof(result)); - - ret = access(event_control_path, W_OK); - if ((ret == -1) && (errno == ENOENT)) { - puts("The cgroup seems to have removed."); - break; - } - - if (ret == -1) - err(1, "cgroup.event_control is not accessible any more"); - - printf("%s %s: crossed\n", argv[1], argv[2]); - } - - return 0; -} -- cgit v1.2.3 From
bf857ddd21d0bffc1edafc317e8e2ce0d6d5950c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:20 -0400 Subject: maple_tree: move debug check to __mas_set_range() __mas_set_range() was created to shortcut resetting the maple state and a debug check was added to the caller (the vma iterator) to ensure the internal maple state remains safe to use. Move the debug check from the vma iterator into the maple tree itself so other users do not incorrectly use the advanced maple state modification. Fallout from this change include a large amount of debug setup needed to be moved to earlier in the header, and the maple_tree.h radix-tree test code needed to move the inclusion of the header to after the atomic define. None of those changes have functional changes. Link: https://lkml.kernel.org/r/20231101171629.3612299-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 255 ++++++++++++++-------------- mm/internal.h | 2 - tools/testing/radix-tree/linux/maple_tree.h | 2 +- 3 files changed, 130 insertions(+), 129 deletions(-) (limited to 'tools') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index a452dd8a1e5c..b5d5992578c9 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -557,6 +557,131 @@ static inline void mas_reset(struct ma_state *mas) */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) + +#ifdef CONFIG_DEBUG_MAPLE_TREE +enum mt_dump_format { + mt_dump_dec, + mt_dump_hex, +}; + +extern atomic_t maple_tree_tests_run; +extern atomic_t maple_tree_tests_passed; + +void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); +void mas_dump(const struct ma_state *mas); +void mas_wr_dump(const struct ma_wr_state *wr_mas); +void mt_validate(struct maple_tree *mt); +void mt_cache_shrink(void); +#define MT_BUG_ON(__tree, __x) do { \ + atomic_inc(&maple_tree_tests_run); 
\ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_BUG_ON(__mas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_WR_BUG_ON(__wrmas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MT_WARN_ON(__tree, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WARN_ON(__mas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + 
atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WR_WARN_ON(__wrmas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) +#else +#define MT_BUG_ON(__tree, __x) BUG_ON(__x) +#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) +#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) +#define MT_WARN_ON(__tree, __x) WARN_ON(__x) +#define MAS_WARN_ON(__mas, __x) WARN_ON(__x) +#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) +#endif /* CONFIG_DEBUG_MAPLE_TREE */ + /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. 
@@ -570,6 +695,9 @@ static inline void mas_reset(struct ma_state *mas) static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { + /* Ensure the range starts within the current slot */ + MAS_WARN_ON(mas, mas_is_active(mas) && + (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } @@ -587,8 +715,8 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { - __mas_set_range(mas, start, last); mas->node = MAS_START; + __mas_set_range(mas, start, last); } /** @@ -713,129 +841,4 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) - -#ifdef CONFIG_DEBUG_MAPLE_TREE -enum mt_dump_format { - mt_dump_dec, - mt_dump_hex, -}; - -extern atomic_t maple_tree_tests_run; -extern atomic_t maple_tree_tests_passed; - -void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); -void mas_dump(const struct ma_state *mas); -void mas_wr_dump(const struct ma_wr_state *wr_mas); -void mt_validate(struct maple_tree *mt); -void mt_cache_shrink(void); -#define MT_BUG_ON(__tree, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mt_dump(__tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MAS_BUG_ON(__mas, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_dump(__mas); \ - mt_dump((__mas)->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - 
atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MAS_WR_BUG_ON(__wrmas, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_wr_dump(__wrmas); \ - mas_dump((__wrmas)->mas); \ - mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MT_WARN_ON(__tree, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mt_dump(__tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ -}) - -#define MAS_WARN_ON(__mas, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_dump(__mas); \ - mt_dump((__mas)->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ -}) - -#define MAS_WR_WARN_ON(__wrmas, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_wr_dump(__wrmas); \ - mas_dump((__wrmas)->mas); \ - mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - 
unlikely(ret); \ -}) -#else -#define MT_BUG_ON(__tree, __x) BUG_ON(__x) -#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) -#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) -#define MT_WARN_ON(__tree, __x) WARN_ON(__x) -#define MAS_WARN_ON(__mas, __x) WARN_ON(__x) -#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) -#endif /* CONFIG_DEBUG_MAPLE_TREE */ - #endif /*_LINUX_MAPLE_TREE_H */ diff --git a/mm/internal.h b/mm/internal.h index 2bc9ff8db393..0005b8adbd5c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1135,8 +1135,6 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) { - MAS_BUG_ON(&vmi->mas, vmi->mas.node != MAS_START && - (vmi->mas.index > index || vmi->mas.last < index)); __mas_set_range(&vmi->mas, index, last - 1); } diff --git a/tools/testing/radix-tree/linux/maple_tree.h b/tools/testing/radix-tree/linux/maple_tree.h index 7d8d1f445b89..06c89bdcc515 100644 --- a/tools/testing/radix-tree/linux/maple_tree.h +++ b/tools/testing/radix-tree/linux/maple_tree.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0+ */ #define atomic_t int32_t -#include "../../../../include/linux/maple_tree.h" #define atomic_inc(x) uatomic_inc(x) #define atomic_read(x) uatomic_read(x) #define atomic_set(x, y) do {} while (0) #define U8_MAX UCHAR_MAX +#include "../../../../include/linux/maple_tree.h" -- cgit v1.2.3 From 31c532a8af57513228c2b12d281104198ff412b8 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:21 -0400 Subject: maple_tree: add end of node tracking to the maple state Analysis of the mas_for_each() iteration showed that there is a significant time spent finding the end of a node. This time can be greatly reduced if the end of the node is cached in the maple state. Care must be taken to update & invalidate as necessary. Link: https://lkml.kernel.org/r/20231101171629.3612299-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 + lib/maple_tree.c | 7 +++++++ tools/testing/radix-tree/maple.c | 1 + 3 files changed, 9 insertions(+) (limited to 'tools') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index b5d5992578c9..0b82efe0cf1e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -393,6 +393,7 @@ struct ma_state { unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; + unsigned char end; /* The end of the node */ }; struct ma_wr_state { diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 8d379d34ea0a..ea0a36341fed 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2841,6 +2841,7 @@ next: goto dead_node; } while (!ma_is_leaf(type)); + mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; @@ -3507,6 +3508,7 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, mas_replace_node(wr_mas->mas, old_enode); reuse_node: mas_update_gap(wr_mas->mas); + wr_mas->mas->end = b_end; return 1; } @@ -4010,6 +4012,7 @@ done: } trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); + mas->end = new_end; return true; } @@ -4190,6 +4193,7 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); + mas->end = new_end; trace_ma_write(__func__, mas, new_end, wr_mas->entry); return true; } @@ -4428,6 +4432,7 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) if (unlikely(mte_dead_node(mas->node))) return 1; + mas->end = mas->offset; return 0; no_entry: @@ -5074,6 +5079,7 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; + mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); @@ -5134,6 +5140,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, 
mas->last = max; mas->index = mas->last - size + 1; + mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index cb5358674521..7095fb0ec026 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -945,6 +945,7 @@ retry: goto retry; } + mas->end = mas_data_end(mas); return ret; not_found: -- cgit v1.2.3 From 067311d33e650adfe7ae23765959ddcc1ba18510 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:25 -0400 Subject: maple_tree: separate ma_state node from status The maple tree node is overloaded to keep status as well as the active node. This, unfortunately, results in a re-walk on underflow or overflow. Since the maple state has room, the status can be placed in its own enum in the structure. Once an underflow/overflow is detected, certain modes can restore the status to active and others may need to re-walk just that one node to see the entry. The status being an enum has the benefit of detecting unhandled status in switch statements. [Liam.Howlett@oracle.com: fix comments about MAS_*] Link: https://lkml.kernel.org/r/20231106154124.614247-1-Liam.Howlett@oracle.com [Liam.Howlett@oracle.com: update forking to separate maple state and node] Link: https://lkml.kernel.org/r/20231106154551.615042-1-Liam.Howlett@oracle.com [Liam.Howlett@oracle.com: fix mas_prev() state separation code] Link: https://lkml.kernel.org/r/20231207193319.4025462-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20231101171629.3612299-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 87 ++++---- include/linux/mm_types.h | 3 +- lib/maple_tree.c | 459 +++++++++++++++++++++++---------------- lib/test_maple_tree.c | 189 ++++++++-------- mm/internal.h | 8 +- tools/testing/radix-tree/maple.c | 26 ++- 6 files changed, 445 insertions(+), 327 deletions(-) (limited to 'tools') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0b82efe0cf1e..4dd668f7b111 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -349,6 +349,36 @@ static inline bool mtree_empty(const struct maple_tree *mt) /* Advanced API */ +/* + * Maple State Status + * ma_active means the maple state is pointing to a node and offset and can + * continue operating on the tree. + * ma_start means we have not searched the tree. + * ma_root means we have searched the tree and the entry we found lives in + * the root of the tree (ie it has index 0, length 1 and is the only entry in + * the tree). + * ma_none means we have searched the tree and there is no node in the + * tree for this entry. For example, we searched for index 1 in an empty + * tree. Or we have a tree which points to a full leaf node and we + * searched for an entry which is larger than can be contained in that + * leaf node. + * ma_pause means the data within the maple state may be stale, restart the + * operation + * ma_overflow means the search has reached the upper limit of the search + * ma_underflow means the search has reached the lower limit of the search + * ma_error means there was an error, check the node for the error number. 
+ */ +enum maple_status { + ma_active, + ma_start, + ma_root, + ma_none, + ma_pause, + ma_overflow, + ma_underflow, + ma_error, +}; + /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the @@ -381,6 +411,13 @@ static inline bool mtree_empty(const struct maple_tree *mt) * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. + * + * The status of the state is used to determine how the next action should treat + * the state. For instance, if the status is ma_start then the next action + * should start at the root of the tree and walk down. If the status is + * ma_pause then the node may be stale data and should be discarded. If the + * status is ma_overflow, then the last action hit the upper limit. + * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ @@ -390,6 +427,7 @@ struct ma_state { unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ + enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; @@ -416,28 +454,12 @@ struct ma_wr_state { spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) - /* * Special values for ma_state.node. - * MAS_START means we have not searched the tree. - * MAS_ROOT means we have searched the tree and the entry we found lives in - * the root of the tree (ie it has index 0, length 1 and is the only entry in - * the tree). - * MAS_NONE means we have searched the tree and there is no node in the - * tree for this entry. 
For example, we searched for index 1 in an empty - * tree. Or we have a tree which points to a full leaf node and we - * searched for an entry which is larger than can be contained in that - * leaf node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ -#define MAS_START ((struct maple_enode *)1UL) -#define MAS_ROOT ((struct maple_enode *)5UL) -#define MAS_NONE ((struct maple_enode *)9UL) -#define MAS_PAUSE ((struct maple_enode *)17UL) -#define MAS_OVERFLOW ((struct maple_enode *)33UL) -#define MAS_UNDERFLOW ((struct maple_enode *)65UL) #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) @@ -446,7 +468,8 @@ struct ma_wr_state { .tree = mt, \ .index = first, \ .last = end, \ - .node = MAS_START, \ + .node = NULL, \ + .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ @@ -477,7 +500,6 @@ void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); -bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); @@ -506,28 +528,18 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; - mas->node = MAS_START; + mas->status = ma_start; + mas->node = NULL; } -/* Checks if a mas has not found anything */ -static inline bool mas_is_none(const struct ma_state *mas) -{ - return mas->node == MAS_NONE; -} - -/* Checks if a mas has been paused */ -static inline bool mas_is_paused(const struct ma_state *mas) +static inline bool mas_is_active(struct ma_state *mas) { - return mas->node == MAS_PAUSE; + return mas->status == ma_active; } -/* 
Check if the mas is pointing to a node or not */ -static inline bool mas_is_active(struct ma_state *mas) +static inline bool mas_is_err(struct ma_state *mas) { - if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE) - return true; - - return false; + return mas->status == ma_error; } /** @@ -540,9 +552,10 @@ static inline bool mas_is_active(struct ma_state *mas) * * Context: Any context. */ -static inline void mas_reset(struct ma_state *mas) +static __always_inline void mas_reset(struct ma_state *mas) { - mas->node = MAS_START; + mas->status = ma_start; + mas->node = NULL; } /** @@ -716,7 +729,7 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { - mas->node = MAS_START; + mas_reset(mas); __mas_set_range(mas, start, last); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ef18d2b25378..a66534c78c4d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1071,7 +1071,8 @@ struct vma_iterator { .mas = { \ .tree = &(__mm)->mm_mt, \ .index = __addr, \ - .node = MAS_START, \ + .node = NULL, \ + .status = ma_start, \ }, \ } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index f0d2aea91351..187a9796188e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -249,40 +249,40 @@ static __always_inline bool mt_is_reserved(const void *entry) xa_is_internal(entry); } -static inline void mas_set_err(struct ma_state *mas, long err) +static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); + mas->status = ma_error; } -static inline bool mas_is_ptr(const struct ma_state *mas) +static __always_inline bool mas_is_ptr(const struct ma_state *mas) { - return mas->node == MAS_ROOT; + return mas->status == ma_root; } -static inline bool mas_is_start(const struct ma_state *mas) +static __always_inline bool mas_is_start(const struct ma_state *mas) { - return mas->node == 
MAS_START; + return mas->status == ma_start; } -bool mas_is_err(struct ma_state *mas) +static __always_inline bool mas_is_none(const struct ma_state *mas) { - return xa_is_err(mas->node); + return mas->status == ma_none; } -static __always_inline bool mas_is_overflow(struct ma_state *mas) +static __always_inline bool mas_is_paused(const struct ma_state *mas) { - if (unlikely(mas->node == MAS_OVERFLOW)) - return true; - - return false; + return mas->status == ma_pause; } -static __always_inline bool mas_is_underflow(struct ma_state *mas) +static __always_inline bool mas_is_overflow(struct ma_state *mas) { - if (unlikely(mas->node == MAS_UNDERFLOW)) - return true; + return mas->status == ma_overflow; +} - return false; +static inline bool mas_is_underflow(struct ma_state *mas) +{ + return mas->status == ma_underflow; } static inline bool mas_searchable(struct ma_state *mas) @@ -1274,6 +1274,7 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) if (mas->mas_flags & MA_STATE_PREALLOC) { if (allocated) return; + BUG_ON(!allocated); WARN_ON(!allocated); } @@ -1379,14 +1380,14 @@ static void mas_node_count(struct ma_state *mas, int count) * mas_start() - Sets up maple state for operations. * @mas: The maple state. * - * If mas->node == MAS_START, then set the min, max and depth to + * If mas->status == mas_start, then set the min, max and depth to * defaults. * * Return: - * - If mas->node is an error or not MAS_START, return NULL. - * - If it's an empty tree: NULL & mas->node == MAS_NONE - * - If it's a single entry: The entry & mas->node == MAS_ROOT - * - If it's a tree: NULL & mas->node == safe root node. + * - If mas->node is an error or not mas_start, return NULL. + * - If it's an empty tree: NULL & mas->status == ma_none + * - If it's a single entry: The entry & mas->status == mas_root + * - If it's a tree: NULL & mas->status == safe root node. 
*/ static inline struct maple_enode *mas_start(struct ma_state *mas) { @@ -1402,6 +1403,7 @@ retry: /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; + mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) @@ -1412,13 +1414,14 @@ retry: /* empty tree */ if (unlikely(!root)) { - mas->node = MAS_NONE; + mas->node = NULL; + mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ - mas->node = MAS_ROOT; + mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ @@ -2225,19 +2228,21 @@ static inline bool mas_next_sibling(struct ma_state *mas) } /* - * mte_node_or_node() - Return the encoded node or MAS_NONE. + * mte_node_or_none() - Set the enode and state. * @enode: The encoded maple node. * - * Shorthand to avoid setting %NULLs in the tree or maple_subtree_state. - * - * Return: @enode or MAS_NONE + * Set the node to the enode and the status. */ -static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode) +static inline void mas_node_or_none(struct ma_state *mas, + struct maple_enode *enode) { - if (enode) - return enode; - - return ma_enode_ptr(MAS_NONE); + if (enode) { + mas->node = enode; + mas->status = ma_active; + } else { + mas->node = NULL; + mas->status = ma_none; + } } /* @@ -2557,13 +2562,15 @@ static inline void mast_set_split_parents(struct maple_subtree_state *mast, * The node will either be RCU freed or pushed back on the maple state. 
*/ static inline void mas_topiary_node(struct ma_state *mas, - struct maple_enode *enode, bool in_rcu) + struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; + struct maple_enode *enode; - if (enode == MAS_NONE) + if (mas_is_none(tmp_mas)) return; + enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); if (in_rcu) @@ -2603,8 +2610,8 @@ static inline void mas_topiary_replace(struct ma_state *mas, /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; - tmp[1].node = MAS_NONE; - tmp[2].node = MAS_NONE; + tmp[1].status = ma_none; + tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { @@ -2624,7 +2631,7 @@ static inline void mas_topiary_replace(struct ma_state *mas, break; while (n < 3) - tmp_next[n++].node = MAS_NONE; + tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; @@ -2637,8 +2644,8 @@ static inline void mas_topiary_replace(struct ma_state *mas, tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; - tmp[1].node = MAS_NONE; - tmp[2].node = MAS_NONE; + tmp[1].status = ma_none; + tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; @@ -2653,7 +2660,7 @@ static inline void mas_topiary_replace(struct ma_state *mas, if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); - tmp_next[n].node = MAS_NONE; + tmp_next[n].status = ma_none; } else { n++; } @@ -2664,16 +2671,16 @@ static inline void mas_topiary_replace(struct ma_state *mas, break; while (n < 3) - tmp_next[n++].node = MAS_NONE; + tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { - mas_topiary_node(mas, tmp[i].node, in_rcu); + mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) - mas_topiary_node(mas, tmp[i].node, in_rcu); + mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } @@ -2712,9 +2719,9 @@ 
static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, { bool new_lmax = true; - mast->l->node = mte_node_or_none(left); - mast->m->node = mte_node_or_none(middle); - mast->r->node = mte_node_or_none(right); + mas_node_or_none(mast->l, left); + mas_node_or_none(mast->m, middle); + mas_node_or_none(mast->r, right); mast->l->min = mast->orig_l->min; if (split == mast->bn->b_end) { @@ -2894,7 +2901,7 @@ static int mas_spanning_rebalance(struct ma_state *mas, mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; - l_mas.node = r_mas.node = m_mas.node = MAS_NONE; + l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && @@ -3421,7 +3428,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) /* Try to push left. */ if (mas_push_data(mas, height, &mast, true)) break; - /* Try to push right. */ if (mas_push_data(mas, height, &mast, false)) break; @@ -3537,6 +3543,7 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); + mas->status = ma_active; if (mas->index) { if (contents) { @@ -3569,7 +3576,7 @@ static inline void mas_store_root(struct ma_state *mas, void *entry) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); - mas->node = MAS_START; + mas->status = ma_start; } } @@ -3801,7 +3808,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) mas->depth = 0; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, entry); - mas->node = MAS_START; + mas->status = ma_start; goto done; } @@ -3814,6 +3821,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); + mas->status = ma_active; 
rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mas->depth = 1; @@ -4367,11 +4375,13 @@ static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, /* * mas_prev_node() - Find the prev non-null entry at the same level in the - * tree. The prev value will be mas->node[mas->offset] or MAS_NONE. + * tree. The prev value will be mas->node[mas->offset] or the status will be + * ma_none. * @mas: The maple state * @min: The lower limit to search * - * The prev node value will be mas->node[mas->offset] or MAS_NONE. + * The prev node value will be mas->node[mas->offset] or the status will be + * ma_none. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) @@ -4441,7 +4451,7 @@ no_entry: if (unlikely(ma_dead_node(node))) return 1; - mas->node = MAS_NONE; + mas->status = ma_underflow; return 0; } @@ -4455,8 +4465,7 @@ no_entry: * * Return: The entry in the previous slot which is possibly NULL */ -static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty, - bool set_underflow) +static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; @@ -4489,13 +4498,16 @@ again: mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { + if (mas->index <= min) + goto underflow; + if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } - if (mas_is_none(mas)) - goto underflow; + if (WARN_ON_ONCE(mas_is_underflow(mas))) + return NULL; mas->last = mas->max; node = mas_mn(mas); @@ -4509,12 +4521,15 @@ again: if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; + if (likely(entry)) return entry; if (!empty) { - if (mas->index <= min) - goto underflow; + if (mas->index <= min) { + mas->status = ma_underflow; + return NULL; + } goto again; } @@ -4522,8 +4537,7 @@ again: return entry; underflow: - if (set_underflow) - mas->node = MAS_UNDERFLOW; + mas->status = ma_underflow; 
return NULL; } @@ -4532,7 +4546,8 @@ underflow: * @mas: The maple state * @max: The maximum pivot value to check. * - * The next value will be mas->node[mas->offset] or MAS_NONE. + * The next value will be mas->node[mas->offset] or the status will have + * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, @@ -4548,13 +4563,13 @@ static int mas_next_node(struct ma_state *mas, struct maple_node *node, void __rcu **slots; if (mas->max >= max) - goto no_entry; + goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) - goto no_entry; + goto overflow; /* Walk up. */ if (unlikely(mas_ascend(mas))) @@ -4605,11 +4620,11 @@ static int mas_next_node(struct ma_state *mas, struct maple_node *node, mas->min = min; return 0; -no_entry: +overflow: if (unlikely(ma_dead_node(node))) return 1; - mas->node = MAS_NONE; + mas->status = ma_overflow; return 0; } @@ -4624,8 +4639,7 @@ no_entry: * * Return: The entry in the next slot which is possibly NULL */ -static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty, - bool set_overflow) +static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; @@ -4646,13 +4660,15 @@ retry: if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else - goto overflow; + pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; - if (pivot >= max) - goto overflow; + if (pivot >= max) { /* Was at the limit, next will extend beyond */ + mas->status = ma_overflow; + return NULL; + } } if (likely(mas->offset < mas->end)) { @@ -4664,16 +4680,18 @@ again: else mas->last = mas->max; } else { + if (mas->last >= max) { + mas->status = ma_overflow; + return NULL; + } + if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } - if (WARN_ON_ONCE(mas_is_none(mas))) { - mas->node = MAS_OVERFLOW; + if 
(WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; - goto overflow; - } mas->offset = 0; mas->index = mas->min; @@ -4691,20 +4709,18 @@ again: if (entry) return entry; + if (!empty) { - if (mas->last >= max) - goto overflow; + if (mas->last >= max) { + mas->status = ma_overflow; + return NULL; + } mas->index = mas->last + 1; goto again; } return entry; - -overflow: - if (set_overflow) - mas->node = MAS_OVERFLOW; - return NULL; } /* @@ -4723,11 +4739,11 @@ overflow: static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) { if (mas->last >= limit) { - mas->node = MAS_OVERFLOW; + mas->status = ma_overflow; return NULL; } - return mas_next_slot(mas, limit, false, true); + return mas_next_slot(mas, limit, false); } /* @@ -4895,7 +4911,7 @@ done: * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If - * mas->node is MAS_NONE, reset to MAS_START. + * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. 
*/ @@ -4904,7 +4920,7 @@ void *mas_walk(struct ma_state *mas) void *entry; if (!mas_is_active(mas) || !mas_is_start(mas)) - mas->node = MAS_START; + mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { @@ -4920,7 +4936,7 @@ retry: mas->index = 1; mas->last = ULONG_MAX; - mas->node = MAS_NONE; + mas->status = ma_none; return NULL; } @@ -5672,27 +5688,40 @@ static bool mas_next_setup(struct ma_state *mas, unsigned long max, bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { - mas->node = MAS_OVERFLOW; + mas->status = ma_overflow; return true; } - if (mas_is_active(mas)) + switch (mas->status) { + case ma_active: return false; - - if (mas_is_none(mas) || mas_is_paused(mas)) { - mas->node = MAS_START; - } else if (mas_is_overflow(mas)) { + case ma_none: + fallthrough; + case ma_pause: + mas->status = ma_start; + fallthrough; + case ma_start: + mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ + break; + case ma_overflow: /* Overflowed before, but the max changed */ - mas->node = MAS_START; - } else if (mas_is_underflow(mas)) { - mas->node = MAS_START; + mas->status = ma_active; + break; + case ma_underflow: + /* The user expects the mas to be one before where it is */ + mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; + break; + case ma_root: + break; + case ma_error: + return true; } - if (mas_is_start(mas)) - *entry = mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ + if (likely(mas_is_active(mas))) /* Fast path */ + return false; if (mas_is_ptr(mas)) { *entry = NULL; @@ -5702,7 +5731,7 @@ static bool mas_next_setup(struct ma_state *mas, unsigned long max, } mas->index = 1; mas->last = ULONG_MAX; - mas->node = MAS_NONE; + mas->status = ma_none; return true; } @@ -5731,7 +5760,7 @@ void *mas_next(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, false, true); + return 
mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); @@ -5754,7 +5783,7 @@ void *mas_next_range(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, true, true); + return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); @@ -5785,33 +5814,45 @@ EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { - mas->node = MAS_UNDERFLOW; + mas->status = ma_underflow; return true; } - if (mas_is_active(mas)) + switch (mas->status) { + case ma_active: return false; - - if (mas_is_overflow(mas)) { - mas->node = MAS_START; + case ma_start: + break; + case ma_none: + fallthrough; + case ma_pause: + mas->status = ma_start; + break; + case ma_underflow: + /* underflowed before but the min changed */ + mas->status = ma_active; + break; + case ma_overflow: + /* User expects mas to be one after where it is */ + mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; - } - - if (mas_is_none(mas) || mas_is_paused(mas)) { - mas->node = MAS_START; - } else if (mas_is_underflow(mas)) { - /* underflowed before but the min changed */ - mas->node = MAS_START; + break; + case ma_root: + break; + case ma_error: + return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { - if (!mas->index) - goto none; + if (!mas->index) { + mas->status = ma_none; + return true; + } mas->index = mas->last = 0; *entry = mas_root(mas); return true; @@ -5821,7 +5862,7 @@ static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry if (mas->index) { /* Walked to out-of-range pointer? 
*/ mas->index = mas->last = 0; - mas->node = MAS_ROOT; + mas->status = ma_root; *entry = mas_root(mas); return true; } @@ -5829,10 +5870,6 @@ static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry } return false; - -none: - mas->node = MAS_NONE; - return true; } /** @@ -5841,7 +5878,7 @@ none: * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. - * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not + * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. @@ -5853,7 +5890,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min) if (mas_prev_setup(mas, min, &entry)) return entry; - return mas_prev_slot(mas, min, false, true); + return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); @@ -5864,7 +5901,7 @@ EXPORT_SYMBOL_GPL(mas_prev); * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. - * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not + * Will reset mas to ma_start if the node is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. 
@@ -5876,7 +5913,7 @@ void *mas_prev_range(struct ma_state *mas, unsigned long min) if (mas_prev_setup(mas, min, &entry)) return entry; - return mas_prev_slot(mas, min, true, true); + return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); @@ -5919,7 +5956,8 @@ EXPORT_SYMBOL_GPL(mt_prev); */ void mas_pause(struct ma_state *mas) { - mas->node = MAS_PAUSE; + mas->status = ma_pause; + mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); @@ -5933,32 +5971,52 @@ EXPORT_SYMBOL_GPL(mas_pause); */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { - if (mas_is_active(mas)) { + switch (mas->status) { + case ma_active: if (mas->last < max) return false; - return true; - } - - if (mas_is_paused(mas)) { + case ma_start: + break; + case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; - mas->node = MAS_START; - } else if (mas_is_none(mas)) { + mas->status = ma_start; + break; + case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; - mas->node = MAS_START; - } else if (mas_is_overflow(mas) || mas_is_underflow(mas)) { - if (mas->index > max) { - mas->node = MAS_OVERFLOW; + mas->status = ma_start; + break; + case ma_underflow: + /* mas is pointing at entry before unable to go lower */ + if (unlikely(mas->index >= max)) { + mas->status = ma_overflow; return true; } - mas->node = MAS_START; + mas->status = ma_active; + *entry = mas_walk(mas); + if (*entry) + return true; + break; + case ma_overflow: + if (unlikely(mas->last >= max)) + return true; + + mas->status = ma_active; + *entry = mas_walk(mas); + if (*entry) + return true; + break; + case ma_root: + break; + case ma_error: + return true; } if (mas_is_start(mas)) { @@ -5985,7 +6043,7 @@ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long m return false; ptr_out_of_range: - mas->node = MAS_NONE; + mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; @@ 
-5999,7 +6057,7 @@ ptr_out_of_range: * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ @@ -6011,7 +6069,10 @@ void *mas_find(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, false, false); + entry = mas_next_slot(mas, max, false); + /* Ignore overflow */ + mas->status = ma_active; + return entry; } EXPORT_SYMBOL_GPL(mas_find); @@ -6023,7 +6084,7 @@ EXPORT_SYMBOL_GPL(mas_find); * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ @@ -6035,7 +6096,7 @@ void *mas_find_range(struct ma_state *mas, unsigned long max) return entry; /* Retries on dead nodes handled by mas_next_slot */ - return mas_next_slot(mas, max, true, false); + return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); @@ -6050,33 +6111,45 @@ EXPORT_SYMBOL_GPL(mas_find_range); static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { - if (mas_is_active(mas)) { - if (mas->index > min) - return false; - - return true; - } - if (mas_is_paused(mas)) { + switch (mas->status) { + case ma_active: + goto active; + case ma_start: + break; + case ma_pause: if (unlikely(mas->index <= min)) { - mas->node = MAS_NONE; + mas->status = ma_underflow; return true; } - mas->node = MAS_START; mas->last = --mas->index; - } else if (mas_is_none(mas)) { + mas->status = ma_start; + break; + case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; - mas->node = MAS_START; - } else if (mas_is_underflow(mas) || mas_is_overflow(mas)) { - if (mas->last <= min) { - mas->node = MAS_UNDERFLOW; + mas->status = ma_start; + break; + case ma_overflow: 
/* user expects the mas to be one after where it is */ + if (unlikely(mas->index <= min)) { + mas->status = ma_underflow; return true; } - mas->node = MAS_START; + mas->status = ma_active; + break; + case ma_underflow: /* user expects the mas to be one before where it is */ + if (unlikely(mas->index <= min)) + return true; + + mas->status = ma_active; + break; + case ma_root: + break; + case ma_error: + return true; } if (mas_is_start(mas)) { @@ -6099,19 +6172,20 @@ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, * previous location is 0. */ mas->last = mas->index = 0; - mas->node = MAS_ROOT; + mas->status = ma_root; *entry = mas_root(mas); return true; } } +active: if (mas->index < min) return true; return false; none: - mas->node = MAS_NONE; + mas->status = ma_none; return true; } @@ -6124,7 +6198,7 @@ none: * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ @@ -6136,7 +6210,7 @@ void *mas_find_rev(struct ma_state *mas, unsigned long min) return entry; /* Retries on dead nodes handled by mas_prev_slot */ - return mas_prev_slot(mas, min, false, false); + return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); @@ -6150,7 +6224,7 @@ EXPORT_SYMBOL_GPL(mas_find_rev); * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. - * May set @mas->node to MAS_NONE. + * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. 
*/ @@ -6162,7 +6236,7 @@ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) return entry; /* Retries on dead nodes handled by mas_prev_slot */ - return mas_prev_slot(mas, min, true, false); + return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); @@ -6183,7 +6257,7 @@ void *mas_erase(struct ma_state *mas) MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) - mas->node = MAS_START; + mas->status = ma_start; /* Retry unnecessary when holding the write lock. */ entry = mas_state_walk(mas); @@ -6228,7 +6302,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp) if (!mas_allocated(mas)) return false; - mas->node = MAS_START; + mas->status = ma_start; return true; } @@ -6627,7 +6701,7 @@ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, node = mt_alloc_one(gfp); if (!node) { - new_mas->node = MAS_NONE; + new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } @@ -6971,11 +7045,11 @@ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { - struct maple_enode *p = MAS_NONE, *mn = mas->node; + struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); - if (!mas_is_none(mas)) + if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) @@ -6988,7 +7062,7 @@ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); - } while (!mas_is_none(mas)); + } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; @@ -7443,7 +7517,7 @@ static void mt_validate_nulls(struct maple_tree *mt) MA_STATE(mas, mt, 0, 0); mas_start(&mas); - if (mas_is_none(&mas) || (mas.node == MAS_ROOT)) + if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) @@ -7460,7 +7534,7 @@ static void mt_validate_nulls(struct maple_tree *mt) last = entry; if (offset == 
mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); - if (mas_is_none(&mas)) + if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), @@ -7469,7 +7543,7 @@ static void mt_validate_nulls(struct maple_tree *mt) offset++; } - } while (!mas_is_none(&mas)); + } while (!mas_is_overflow(&mas)); } /* @@ -7490,7 +7564,7 @@ void mt_validate(struct maple_tree *mt) while (!mte_is_leaf(mas.node)) mas_descend(&mas); - while (!mas_is_none(&mas)) { + while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && @@ -7515,16 +7589,35 @@ EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node); - if (mas_is_none(mas)) - pr_err("(MAS_NONE) "); - else if (mas_is_ptr(mas)) - pr_err("(MAS_ROOT) "); - else if (mas_is_start(mas)) - pr_err("(MAS_START) "); - else if (mas_is_paused(mas)) - pr_err("(MAS_PAUSED) "); - - pr_err("[%u] index=%lx last=%lx\n", mas->offset, mas->index, mas->last); + switch (mas->status) { + case ma_active: + pr_err("(ma_active)"); + break; + case ma_none: + pr_err("(ma_none)"); + break; + case ma_root: + pr_err("(ma_root)"); + break; + case ma_start: + pr_err("(ma_start) "); + break; + case ma_pause: + pr_err("(ma_pause) "); + break; + case ma_overflow: + pr_err("(ma_overflow) "); + break; + case ma_underflow: + pr_err("(ma_underflow) "); + break; + case ma_error: + pr_err("(ma_error) "); + break; + } + + pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, + mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 3e4597fb49d3..e7a5d688c9e0 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -54,6 +54,11 @@ atomic_t maple_tree_tests_passed; #else 
#define cond_resched() do {} while (0) #endif + +#define mas_is_none(x) ((x)->status == ma_none) +#define mas_is_overflow(x) ((x)->status == ma_overflow) +#define mas_is_underflow(x) ((x)->status == ma_underflow) + static int __init mtree_insert_index(struct maple_tree *mt, unsigned long index, gfp_t gfp) { @@ -582,7 +587,7 @@ static noinline void __init check_find(struct maple_tree *mt) MT_BUG_ON(mt, last != mas.last); - mas.node = MAS_NONE; + mas.status = ma_none; mas.index = ULONG_MAX; mas.last = ULONG_MAX; entry2 = mas_prev(&mas, 0); @@ -2178,7 +2183,7 @@ static noinline void __init next_prev_test(struct maple_tree *mt) MT_BUG_ON(mt, val != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 5); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); mas.index = 0; mas.last = 5; @@ -3042,10 +3047,6 @@ static noinline void __init check_empty_area_fill(struct maple_tree *mt) * DNE active active range of NULL */ -#define mas_active(x) (((x).node != MAS_ROOT) && \ - ((x).node != MAS_START) && \ - ((x).node != MAS_PAUSE) && \ - ((x).node != MAS_NONE)) static noinline void __init check_state_handling(struct maple_tree *mt) { MA_STATE(mas, mt, 0, 0); @@ -3060,7 +3061,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) /* prev: Start -> underflow*/ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, mas.status != ma_underflow); /* prev: Start -> root */ mas_set(&mas, 10); @@ -3068,7 +3069,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* prev: pause -> root */ mas_set(&mas, 10); @@ -3077,7 +3078,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, 
mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* next: start -> none */ mas_set(&mas, 0); @@ -3085,7 +3086,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* next: start -> none*/ mas_set(&mas, 10); @@ -3093,7 +3094,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find: start -> root */ mas_set(&mas, 0); @@ -3101,21 +3102,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* find: root -> none */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find: none -> none */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find: start -> none */ mas_set(&mas, 10); @@ -3123,14 +3124,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find_rev: none -> root */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, 
mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* find_rev: start -> root */ mas_set(&mas, 0); @@ -3138,21 +3139,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* find_rev: root -> none */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find_rev: none -> none */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* find_rev: start -> root */ mas_set(&mas, 10); @@ -3160,7 +3161,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: start -> none */ mas_set(&mas, 10); @@ -3168,7 +3169,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: pause -> none*/ mas_set(&mas, 10); @@ -3177,7 +3178,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: none -> none */ mas.index = mas.last = 10; @@ -3185,14 +3186,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); 
MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: none -> none */ entry = mas_walk(&mas); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: start -> root */ mas_set(&mas, 0); @@ -3200,7 +3201,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: pause -> root */ mas_set(&mas, 0); @@ -3209,22 +3210,22 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: none -> root */ - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_walk(&mas); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: root -> root */ entry = mas_walk(&mas); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); /* walk: root -> none */ mas_set(&mas, 10); @@ -3232,7 +3233,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 1); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_NONE); + MT_BUG_ON(mt, mas.status != ma_none); /* walk: none -> root */ mas.index = mas.last = 0; @@ -3240,7 +3241,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0); - 
MT_BUG_ON(mt, mas.node != MAS_ROOT); + MT_BUG_ON(mt, mas.status != ma_root); mas_unlock(&mas); @@ -3258,7 +3259,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* next: pause ->active */ mas_set(&mas, 0); @@ -3267,126 +3268,132 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* next: none ->active */ mas.index = mas.last = 0; mas.offset = 0; - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* next:active ->active */ - entry = mas_next(&mas, ULONG_MAX); + /* next:active ->active (spanning limit) */ + entry = mas_next(&mas, 0x2100); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* next:active -> active beyond data */ + /* next:active -> overflow (limit reached) beyond data */ entry = mas_next(&mas, 0x2999); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x2501); MT_BUG_ON(mt, mas.last != 0x2fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_overflow(&mas)); - /* Continue after last range ends after max */ + /* next:overflow -> active (limit changed) */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* next:active -> active continued */ + /* next:active 
-> overflow (limit reached) */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x3501); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, !mas_active(mas)); - - /* next:active -> overflow */ - entry = mas_next(&mas, ULONG_MAX); - MT_BUG_ON(mt, entry != NULL); - MT_BUG_ON(mt, mas.index != 0x3501); - MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_OVERFLOW); + MT_BUG_ON(mt, !mas_is_overflow(&mas)); /* next:overflow -> overflow */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x3501); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, mas.node != MAS_OVERFLOW); + MT_BUG_ON(mt, !mas_is_overflow(&mas)); /* prev:overflow -> active */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* next: none -> active, skip value at location */ mas_set(&mas, 0); entry = mas_next(&mas, ULONG_MAX); - mas.node = MAS_NONE; + mas.status = ma_none; mas.offset = 0; entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* prev:active ->active */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* prev:active -> active spanning end range */ + /* prev:active -> underflow (span limit) */ + mas_next(&mas, ULONG_MAX); + entry = mas_prev(&mas, 0x1200); + MT_BUG_ON(mt, entry != ptr); + MT_BUG_ON(mt, mas.index != 0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* spanning limit */ + entry = mas_prev(&mas, 0x1200); /* underflow */ + MT_BUG_ON(mt, entry != NULL); + MT_BUG_ON(mt, mas.index != 
0x1000); + MT_BUG_ON(mt, mas.last != 0x1500); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); + + /* prev:underflow -> underflow (lower limit) spanning end range */ entry = mas_prev(&mas, 0x0100); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); - /* prev:active -> underflow */ + /* prev:underflow -> underflow */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* prev:underflow -> underflow */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* next:underflow -> active */ entry = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* prev:first value -> underflow */ entry = mas_prev(&mas, 0x1000); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, mas.node != MAS_UNDERFLOW); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* find:underflow -> first value */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* prev: pause ->active */ mas_set(&mas, 0x3600); @@ -3397,21 +3404,21 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* prev:active -> active spanning min */ + /* 
prev:active -> underflow spanning min */ entry = mas_prev(&mas, 0x1600); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* prev: active ->active, continue */ entry = mas_prev(&mas, 0); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find: start ->active */ mas_set(&mas, 0); @@ -3419,7 +3426,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find: pause ->active */ mas_set(&mas, 0); @@ -3428,7 +3435,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find: start ->active on value */; mas_set(&mas, 1200); @@ -3436,14 +3443,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find:active ->active */ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find:active -> active (NULL)*/ @@ -3451,35 +3458,35 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x2501); MT_BUG_ON(mt, mas.last != 0x2FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MAS_BUG_ON(&mas, !mas_is_active(&mas)); /* find: 
overflow ->active */ entry = mas_find(&mas, 0x5000); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find:active -> active (NULL) end*/ entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x3501); MT_BUG_ON(mt, mas.last != ULONG_MAX); - MT_BUG_ON(mt, !mas_active(mas)); + MAS_BUG_ON(&mas, !mas_is_active(&mas)); /* find_rev: active (END) ->active */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != ptr3); MT_BUG_ON(mt, mas.index != 0x3000); MT_BUG_ON(mt, mas.last != 0x3500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find_rev:active ->active */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != ptr2); MT_BUG_ON(mt, mas.index != 0x2000); MT_BUG_ON(mt, mas.last != 0x2500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* find_rev: pause ->active */ mas_pause(&mas); @@ -3487,14 +3494,14 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); - /* find_rev:active -> active */ + /* find_rev:active -> underflow */ entry = mas_find_rev(&mas, 0); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0); MT_BUG_ON(mt, mas.last != 0x0FFF); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_underflow(&mas)); /* find_rev: start ->active */ mas_set(&mas, 0x1200); @@ -3502,7 +3509,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk start ->active */ mas_set(&mas, 0x1200); @@ -3510,7 +3517,7 @@ static noinline void __init 
check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk start ->active */ mas_set(&mas, 0x1600); @@ -3518,7 +3525,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk pause ->active */ mas_set(&mas, 0x1200); @@ -3527,7 +3534,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk pause -> active */ mas_set(&mas, 0x1600); @@ -3536,25 +3543,25 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk none -> active */ mas_set(&mas, 0x1200); - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_walk(&mas); MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk none -> active */ mas_set(&mas, 0x1600); - mas.node = MAS_NONE; + mas.status = ma_none; entry = mas_walk(&mas); MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk active -> active */ mas.index = 0x1200; @@ -3564,7 +3571,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != ptr); MT_BUG_ON(mt, mas.index != 0x1000); 
MT_BUG_ON(mt, mas.last != 0x1500); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); /* mas_walk active -> active */ mas.index = 0x1600; @@ -3573,7 +3580,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, entry != NULL); MT_BUG_ON(mt, mas.index != 0x1501); MT_BUG_ON(mt, mas.last != 0x1fff); - MT_BUG_ON(mt, !mas_active(mas)); + MT_BUG_ON(mt, !mas_is_active(&mas)); mas_unlock(&mas); } diff --git a/mm/internal.h b/mm/internal.h index 0005b8adbd5c..8450562744cf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1163,13 +1163,13 @@ static inline void vma_iter_store(struct vma_iterator *vmi, { #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START && + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.index > vma->vm_start)) { pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", vmi->mas.index, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } - if (MAS_WARN_ON(&vmi->mas, vmi->mas.node != MAS_START && + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.last < vma->vm_start)) { pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, @@ -1177,7 +1177,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi, } #endif - if (vmi->mas.node != MAS_START && + if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); @@ -1188,7 +1188,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi, static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { - if (vmi->mas.node != MAS_START && + if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 
7095fb0ec026..857c439e6bbc 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -118,6 +118,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.alloc == NULL); MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); mas_push_node(&mas, mn); + mas_reset(&mas); mas_nomem(&mas, GFP_KERNEL); /* free */ mtree_unlock(mt); @@ -141,7 +142,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); - mas.node = MAS_START; + mas.status = ma_start; mas_nomem(&mas, GFP_KERNEL); /* Allocate 3 nodes, will fail. */ mas_node_count(&mas, 3); @@ -158,6 +159,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) /* Ensure we counted 3. */ MT_BUG_ON(mt, mas_allocated(&mas) != 3); /* Free. */ + mas_reset(&mas); mas_nomem(&mas, GFP_KERNEL); /* Set allocation request to 1. */ @@ -272,6 +274,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) ma_free_rcu(mn); MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); } + mas_reset(&mas); MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); } @@ -294,6 +297,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) smn = smn->slot[0]; /* next. */ } MT_BUG_ON(mt, mas_allocated(&mas) != total); + mas_reset(&mas); mas_nomem(&mas, GFP_KERNEL); /* Free. 
*/ MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -441,7 +445,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mas.node = MA_ERROR(-ENOMEM); mas_node_count(&mas, 10); /* Request */ mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.node = MAS_START; + mas.status = ma_start; MT_BUG_ON(mt, mas_allocated(&mas) != 10); mas_destroy(&mas); @@ -452,7 +456,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mas.node = MA_ERROR(-ENOMEM); mas_node_count(&mas, 10 + MAPLE_ALLOC_SLOTS - 1); /* Request */ mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.node = MAS_START; + mas.status = ma_start; MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1); mas_destroy(&mas); @@ -941,7 +945,7 @@ retry: ret = mas_descend_walk(mas, range_min, range_max); if (unlikely(mte_dead_node(mas->node))) { - mas->node = MAS_START; + mas->status = ma_start; goto retry; } @@ -961,10 +965,10 @@ static inline void *mas_range_load(struct ma_state *mas, unsigned long index = mas->index; if (mas_is_none(mas) || mas_is_paused(mas)) - mas->node = MAS_START; + mas->status = ma_start; retry: if (mas_tree_walk(mas, range_min, range_max)) - if (unlikely(mas->node == MAS_ROOT)) + if (unlikely(mas->status == ma_root)) return mas_root(mas); if (likely(mas->offset != MAPLE_NODE_SLOTS)) @@ -35337,7 +35341,7 @@ static void mas_dfs_preorder(struct ma_state *mas) unsigned char end, slot = 0; unsigned long *pivots; - if (mas->node == MAS_START) { + if (mas->status == ma_start) { mas_start(mas); return; } @@ -35374,7 +35378,7 @@ walk_up: return; done: - mas->node = MAS_NONE; + mas->status = ma_none; } @@ -35833,7 +35837,7 @@ static noinline void __init check_nomem(struct maple_tree *mt) mas_store(&ms, &ms); /* insert 1 -> &ms, fails. */ MT_BUG_ON(mt, ms.node != MA_ERROR(-ENOMEM)); mas_nomem(&ms, GFP_KERNEL); /* Node allocated in here. 
*/ - MT_BUG_ON(mt, ms.node != MAS_START); + MT_BUG_ON(mt, ms.status != ma_start); mtree_unlock(mt); MT_BUG_ON(mt, mtree_insert(mt, 2, mt, GFP_KERNEL) != 0); mtree_lock(mt); @@ -35952,7 +35956,7 @@ static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) if (mas_is_ptr(&mas_a) || mas_is_ptr(&mas_b)) { if (!(mas_is_ptr(&mas_a) && mas_is_ptr(&mas_b))) { - pr_err("One is MAS_ROOT and the other is not.\n"); + pr_err("One is ma_root and the other is not.\n"); return -1; } return 0; @@ -35961,7 +35965,7 @@ static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) while (!mas_is_none(&mas_a) || !mas_is_none(&mas_b)) { if (mas_is_none(&mas_a) || mas_is_none(&mas_b)) { - pr_err("One is MAS_NONE and the other is not.\n"); + pr_err("One is ma_none and the other is not.\n"); return -1; } -- cgit v1.2.3 From 9a40d45c1f2c49273c04938ec3d7849f685eb3c1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:26 -0400 Subject: maple_tree: remove mas_searchable() Now that the status of the maple state is outside of the node, the mas_searchable() function can be dropped for easier open-coding of what is going on. Link: https://lkml.kernel.org/r/20231101171629.3612299-10-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 66 ++++++++++------------------------------ tools/testing/radix-tree/maple.c | 4 ++- 2 files changed, 19 insertions(+), 51 deletions(-) (limited to 'tools') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 187a9796188e..c7016066f12b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -285,17 +285,6 @@ static inline bool mas_is_underflow(struct ma_state *mas) return mas->status == ma_underflow; } -static inline bool mas_searchable(struct ma_state *mas) -{ - if (mas_is_none(mas)) - return false; - - if (mas_is_ptr(mas)) - return false; - - return true; -} - static __always_inline struct maple_node *mte_to_node( const struct maple_enode *entry) { @@ -6030,12 +6019,11 @@ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long m } - if (unlikely(!mas_searchable(mas))) { - if (unlikely(mas_is_ptr(mas))) - goto ptr_out_of_range; + if (unlikely(mas_is_ptr(mas))) + goto ptr_out_of_range; + if (unlikely(mas_is_none(mas))) return true; - } if (mas->index == max) return true; @@ -6162,20 +6150,18 @@ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, return true; } - if (unlikely(!mas_searchable(mas))) { - if (mas_is_ptr(mas)) - goto none; + if (unlikely(mas_is_ptr(mas))) + goto none; - if (mas_is_none(mas)) { - /* - * Walked to the location, and there was nothing so the - * previous location is 0. - */ - mas->last = mas->index = 0; - mas->status = ma_root; - *entry = mas_root(mas); - return true; - } + if (unlikely(mas_is_none(mas))) { + /* + * Walked to the location, and there was nothing so the previous + * location is 0. 
+ */ + mas->last = mas->index = 0; + mas->status = ma_root; + *entry = mas_root(mas); + return true; } active: @@ -6905,7 +6891,7 @@ retry: if (entry) goto unlock; - while (mas_searchable(&mas) && (mas.last < max)) { + while (mas_is_active(&mas) && (mas.last < max)) { entry = mas_next_entry(&mas, max); if (likely(entry && !xa_is_zero(entry))) break; @@ -6987,26 +6973,6 @@ unsigned int mt_nr_allocated(void) return kmem_cache_nr_allocated(maple_node_cache); } -/* - * mas_dead_node() - Check if the maple state is pointing to a dead node. - * @mas: The maple state - * @index: The index to restore in @mas. - * - * Used in test code. - * Return: 1 if @mas has been reset to MAS_START, 0 otherwise. - */ -static inline int mas_dead_node(struct ma_state *mas, unsigned long index) -{ - if (unlikely(!mas_searchable(mas) || mas_is_start(mas))) - return 0; - - if (likely(!mte_dead_node(mas->node))) - return 0; - - mas_rewalk(mas, index); - return 1; -} - void mt_cache_shrink(void) { } @@ -7558,7 +7524,7 @@ void mt_validate(struct maple_tree *mt) MA_STATE(mas, mt, 0, 0); rcu_read_lock(); mas_start(&mas); - if (!mas_searchable(&mas)) + if (!mas_is_active(&mas)) goto done; while (!mte_is_leaf(mas.node)) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 857c439e6bbc..56ae47291ee0 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -974,8 +974,10 @@ retry: if (likely(mas->offset != MAPLE_NODE_SLOTS)) entry = mas_get_slot(mas, mas->offset); - if (mas_dead_node(mas, index)) + if (mas_is_active(mas) && mte_dead_node(mas->node)) { + mas_set(mas, index); goto retry; + } return entry; } -- cgit v1.2.3 From a697dc2be925d4814f26d7588347ccdd2c5525ed Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:22 -0800 Subject: selftests: cgroup: update per-memcg zswap writeback selftest The memcg-zswap self test is updated to adjust to the behavior change implemented by commit 87730b165089 ("zswap: make 
shrinking memcg-aware"), where zswap performs writeback for specific memcg. Link: https://lkml.kernel.org/r/20231130194023.4102148-6-nphamcs@gmail.com Signed-off-by: Domenico Cerasuolo Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Acked-by: Chris Li (Google) Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 74 +++++++++++++++++++---------- 1 file changed, 50 insertions(+), 24 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index c99d2adaca3f..47fdaa146443 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -50,9 +50,9 @@ static int get_zswap_stored_pages(size_t *value) return read_int("/sys/kernel/debug/zswap/stored_pages", value); } -static int get_zswap_written_back_pages(size_t *value) +static int get_cg_wb_count(const char *cg) { - return read_int("/sys/kernel/debug/zswap/written_back_pages", value); + return cg_read_key_long(cg, "memory.stat", "zswp_wb"); } static long get_zswpout(const char *cgroup) @@ -73,6 +73,24 @@ static int allocate_bytes(const char *cgroup, void *arg) return 0; } +static char *setup_test_group_1M(const char *root, const char *name) +{ + char *group_name = cg_name(root, name); + + if (!group_name) + return NULL; + if (cg_create(group_name)) + goto fail; + if (cg_write(group_name, "memory.max", "1M")) { + cg_destroy(group_name); + goto fail; + } + return group_name; +fail: + free(group_name); + return NULL; +} + /* * Sanity test to check that pages are written into zswap. */ @@ -117,43 +135,51 @@ out: /* * When trying to store a memcg page in zswap, if the memcg hits its memory - * limit in zswap, writeback should not be triggered. 
- * - * This was fixed with commit 0bdf0efa180a("zswap: do not shrink if cgroup may - * not zswap"). Needs to be revised when a per memcg writeback mechanism is - * implemented. + * limit in zswap, writeback should affect only the zswapped pages of that + * memcg. */ static int test_no_invasive_cgroup_shrink(const char *root) { - size_t written_back_before, written_back_after; int ret = KSFT_FAIL; - char *test_group; + size_t control_allocation_size = MB(10); + char *control_allocation, *wb_group = NULL, *control_group = NULL; /* Set up */ - test_group = cg_name(root, "no_shrink_test"); - if (!test_group) - goto out; - if (cg_create(test_group)) + wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); + if (!wb_group) + return KSFT_FAIL; + if (cg_write(wb_group, "memory.zswap.max", "10K")) goto out; - if (cg_write(test_group, "memory.max", "1M")) + control_group = setup_test_group_1M(root, "per_memcg_wb_test2"); + if (!control_group) goto out; - if (cg_write(test_group, "memory.zswap.max", "10K")) + + /* Push some test_group2 memory into zswap */ + if (cg_enter_current(control_group)) goto out; - if (get_zswap_written_back_pages(&written_back_before)) + control_allocation = malloc(control_allocation_size); + for (int i = 0; i < control_allocation_size; i += 4095) + control_allocation[i] = 'a'; + if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1) goto out; - /* Allocate 10x memory.max to push memory into zswap */ - if (cg_run(test_group, allocate_bytes, (void *)MB(10))) + /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */ + if (cg_run(wb_group, allocate_bytes, (void *)MB(10))) goto out; - /* Verify that no writeback happened because of the memcg allocation */ - if (get_zswap_written_back_pages(&written_back_after)) - goto out; - if (written_back_after == written_back_before) + /* Verify that only zswapped memory from gwb_group has been written back */ + if (get_cg_wb_count(wb_group) > 0 && 
get_cg_wb_count(control_group) == 0) ret = KSFT_PASS; out: - cg_destroy(test_group); - free(test_group); + cg_enter_current(root); + if (control_group) { + cg_destroy(control_group); + free(control_group); + } + cg_destroy(wb_group); + free(wb_group); + if (control_allocation) + free(control_allocation); return ret; } -- cgit v1.2.3 From 4b86316ef18231109e4ebb3661ffc69d816fb56f Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Wed, 29 Nov 2023 15:11:40 -0700 Subject: selftests/mm: dont run ksm_functional_tests twice ksm functional test is already being run. Remove the duplicate call to ./ksm_functional_tests. Link: https://lkml.kernel.org/r/20231129221140.614713-1-npache@redhat.com Fixes: 93fb70aa5904 ("selftests/vm: add KSM unmerge tests") Signed-off-by: Nico Pache Acked-by: Joel Savitz Cc: David Hildenbrand Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 00757445278e..c0212258b852 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -334,8 +334,6 @@ CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 CATEGORY="ksm" run_test ./ksm_functional_tests -run_test ./ksm_functional_tests - # protection_keys tests if [ -x ./protection_keys_32 ] then -- cgit v1.2.3 From 3649caed1c9b7aa57049620c498596c17fc7af9e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:49 +0000 Subject: selftests/damon: test quota goals directory Add DAMON selftests for testing creation/existence of quota goals directories and files, and simple valid input writes. 
Link: https://lkml.kernel.org/r/20231130023652.50284-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 56f0230a8b92..e9a976d296e2 100755 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -150,6 +150,32 @@ test_weights() ensure_file "$weights_dir/age_permil" "exist" "600" } +test_goal() +{ + goal_dir=$1 + ensure_dir "$goal_dir" "exist" + ensure_file "$goal_dir/target_value" "exist" "600" + ensure_file "$goal_dir/current_value" "exist" "600" +} + +test_goals() +{ + goals_dir=$1 + ensure_dir "$goals_dir" "exist" + ensure_file "$goals_dir/nr_goals" "exist" "600" + + ensure_write_succ "$goals_dir/nr_goals" "1" "valid input" + test_goal "$goals_dir/0" + + ensure_write_succ "$goals_dir/nr_goals" "2" "valid input" + test_goal "$goals_dir/0" + test_goal "$goals_dir/1" + + ensure_write_succ "$goals_dir/nr_goals" "0" "valid input" + ensure_dir "$goals_dir/0" "not_exist" + ensure_dir "$goals_dir/1" "not_exist" +} + test_quotas() { quotas_dir=$1 @@ -158,6 +184,7 @@ test_quotas() ensure_file "$quotas_dir/bytes" "exist" 600 ensure_file "$quotas_dir/reset_interval_ms" "exist" 600 test_weights "$quotas_dir/weights" + test_goals "$quotas_dir/goals" } test_access_pattern() -- cgit v1.2.3 From b6aab3384cafba151c53d3b5f7e1f8d073aadf03 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:06 +0000 Subject: selftests/mm/kugepaged: restore thp settings at exit Previously, the saved thp settings would be restored upon a signal or at the natural end of the test suite. But there are some tests that directly call exit() upon failure. 
In this case, the thp settings were not being restored, which could then influence other tests. Fix this by installing an atexit() handler to do the actual restore. The signal handler can now just call exit() and the atexit handler is invoked. Link: https://lkml.kernel.org/r/20231207161211.2374093-6-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Alistair Popple Reviewed-by: David Hildenbrand Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/khugepaged.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 030667cb5533..fc47a1c4944c 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -374,18 +374,22 @@ static void pop_settings(void) write_settings(current_settings()); } -static void restore_settings(int sig) +static void restore_settings_atexit(void) { if (skip_settings_restore) - goto out; + return; printf("Restore THP and khugepaged settings..."); write_settings(&saved_settings); success("OK"); - if (sig) - exit(EXIT_FAILURE); -out: - exit(exit_status); + + skip_settings_restore = true; +} + +static void restore_settings(int sig) +{ + /* exit() will invoke the restore_settings_atexit handler. */ + exit(sig ? 
EXIT_FAILURE : exit_status); } static void save_settings(void) @@ -415,6 +419,7 @@ static void save_settings(void) success("OK"); + atexit(restore_settings_atexit); signal(SIGTERM, restore_settings); signal(SIGINT, restore_settings); signal(SIGHUP, restore_settings); -- cgit v1.2.3 From 00679a183ac6d2584723cfc2a2c07c8285f802dc Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:07 +0000 Subject: selftests/mm: factor out thp settings management The khugepaged test has a useful framework for save/restore/pop/push of all thp settings via the sysfs interface. This will be useful to explicitly control multi-size THP settings in other tests, so let's move it out of khugepaged and into its own thp_settings.[c|h] utility. Link: https://lkml.kernel.org/r/20231207161211.2374093-7-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Alistair Popple Acked-by: David Hildenbrand Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 4 +- tools/testing/selftests/mm/khugepaged.c | 346 ++---------------------------- tools/testing/selftests/mm/thp_settings.c | 296 +++++++++++++++++++++++++ tools/testing/selftests/mm/thp_settings.h | 71 ++++++ 4 files changed, 391 insertions(+), 326 deletions(-) create mode 100644 tools/testing/selftests/mm/thp_settings.c create mode 100644 tools/testing/selftests/mm/thp_settings.h (limited to 'tools') diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index dede0bcf97a3..2453add65d12 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -117,8 +117,8 @@ TEST_FILES += va_high_addr_switch.sh include ../lib.mk -$(TEST_GEN_PROGS): vm_util.c -$(TEST_GEN_FILES): vm_util.c +$(TEST_GEN_PROGS): vm_util.c thp_settings.c +$(TEST_GEN_FILES): vm_util.c thp_settings.c $(OUTPUT)/uffd-stress: uffd-common.c $(OUTPUT)/uffd-unit-tests: uffd-common.c diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index fc47a1c4944c..b15e7fd70176 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -22,13 +22,13 @@ #include "linux/magic.h" #include "vm_util.h" +#include "thp_settings.h" #define BASE_ADDR ((void *)(1UL << 30)) static unsigned long hpage_pmd_size; static unsigned long page_size; static int hpage_pmd_nr; -#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" #define TEST_FILE "collapse_test_file" @@ -71,78 +71,7 @@ struct file_info { }; static struct file_info finfo; - -enum thp_enabled { - THP_ALWAYS, - THP_MADVISE, - THP_NEVER, -}; - -static const char *thp_enabled_strings[] = { - "always", - "madvise", - "never", - NULL -}; - -enum thp_defrag { - THP_DEFRAG_ALWAYS, - 
THP_DEFRAG_DEFER, - THP_DEFRAG_DEFER_MADVISE, - THP_DEFRAG_MADVISE, - THP_DEFRAG_NEVER, -}; - -static const char *thp_defrag_strings[] = { - "always", - "defer", - "defer+madvise", - "madvise", - "never", - NULL -}; - -enum shmem_enabled { - SHMEM_ALWAYS, - SHMEM_WITHIN_SIZE, - SHMEM_ADVISE, - SHMEM_NEVER, - SHMEM_DENY, - SHMEM_FORCE, -}; - -static const char *shmem_enabled_strings[] = { - "always", - "within_size", - "advise", - "never", - "deny", - "force", - NULL -}; - -struct khugepaged_settings { - bool defrag; - unsigned int alloc_sleep_millisecs; - unsigned int scan_sleep_millisecs; - unsigned int max_ptes_none; - unsigned int max_ptes_swap; - unsigned int max_ptes_shared; - unsigned long pages_to_scan; -}; - -struct settings { - enum thp_enabled thp_enabled; - enum thp_defrag thp_defrag; - enum shmem_enabled shmem_enabled; - bool use_zero_page; - struct khugepaged_settings khugepaged; - unsigned long read_ahead_kb; -}; - -static struct settings saved_settings; static bool skip_settings_restore; - static int exit_status; static void success(const char *msg) @@ -161,226 +90,13 @@ static void skip(const char *msg) printf(" \e[33m%s\e[0m\n", msg); } -static int read_file(const char *path, char *buf, size_t buflen) -{ - int fd; - ssize_t numread; - - fd = open(path, O_RDONLY); - if (fd == -1) - return 0; - - numread = read(fd, buf, buflen - 1); - if (numread < 1) { - close(fd); - return 0; - } - - buf[numread] = '\0'; - close(fd); - - return (unsigned int) numread; -} - -static int write_file(const char *path, const char *buf, size_t buflen) -{ - int fd; - ssize_t numwritten; - - fd = open(path, O_WRONLY); - if (fd == -1) { - printf("open(%s)\n", path); - exit(EXIT_FAILURE); - return 0; - } - - numwritten = write(fd, buf, buflen - 1); - close(fd); - if (numwritten < 1) { - printf("write(%s)\n", buf); - exit(EXIT_FAILURE); - return 0; - } - - return (unsigned int) numwritten; -} - -static int read_string(const char *name, const char *strings[]) -{ - char 
path[PATH_MAX]; - char buf[256]; - char *c; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!read_file(path, buf, sizeof(buf))) { - perror(path); - exit(EXIT_FAILURE); - } - - c = strchr(buf, '['); - if (!c) { - printf("%s: Parse failure\n", __func__); - exit(EXIT_FAILURE); - } - - c++; - memmove(buf, c, sizeof(buf) - (c - buf)); - - c = strchr(buf, ']'); - if (!c) { - printf("%s: Parse failure\n", __func__); - exit(EXIT_FAILURE); - } - *c = '\0'; - - ret = 0; - while (strings[ret]) { - if (!strcmp(strings[ret], buf)) - return ret; - ret++; - } - - printf("Failed to parse %s\n", name); - exit(EXIT_FAILURE); -} - -static void write_string(const char *name, const char *val) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!write_file(path, val, strlen(val) + 1)) { - perror(path); - exit(EXIT_FAILURE); - } -} - -static const unsigned long _read_num(const char *path) -{ - char buf[21]; - - if (read_file(path, buf, sizeof(buf)) < 0) { - perror("read_file(read_num)"); - exit(EXIT_FAILURE); - } - - return strtoul(buf, NULL, 10); -} - -static const unsigned long read_num(const char *name) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - return _read_num(path); -} - -static void _write_num(const char *path, unsigned long num) -{ - char buf[21]; - - sprintf(buf, "%ld", num); - if (!write_file(path, buf, strlen(buf) + 1)) { - perror(path); - exit(EXIT_FAILURE); - } -} - -static void write_num(const char *name, unsigned long num) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - 
printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - _write_num(path, num); -} - -static void write_settings(struct settings *settings) -{ - struct khugepaged_settings *khugepaged = &settings->khugepaged; - - write_string("enabled", thp_enabled_strings[settings->thp_enabled]); - write_string("defrag", thp_defrag_strings[settings->thp_defrag]); - write_string("shmem_enabled", - shmem_enabled_strings[settings->shmem_enabled]); - write_num("use_zero_page", settings->use_zero_page); - - write_num("khugepaged/defrag", khugepaged->defrag); - write_num("khugepaged/alloc_sleep_millisecs", - khugepaged->alloc_sleep_millisecs); - write_num("khugepaged/scan_sleep_millisecs", - khugepaged->scan_sleep_millisecs); - write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); - write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); - write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); - write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); - - if (file_ops && finfo.type == VMA_FILE) - _write_num(finfo.dev_queue_read_ahead_path, - settings->read_ahead_kb); -} - -#define MAX_SETTINGS_DEPTH 4 -static struct settings settings_stack[MAX_SETTINGS_DEPTH]; -static int settings_index; - -static struct settings *current_settings(void) -{ - if (!settings_index) { - printf("Fail: No settings set"); - exit(EXIT_FAILURE); - } - return settings_stack + settings_index - 1; -} - -static void push_settings(struct settings *settings) -{ - if (settings_index >= MAX_SETTINGS_DEPTH) { - printf("Fail: Settings stack exceeded"); - exit(EXIT_FAILURE); - } - settings_stack[settings_index++] = *settings; - write_settings(current_settings()); -} - -static void pop_settings(void) -{ - if (settings_index <= 0) { - printf("Fail: Settings stack empty"); - exit(EXIT_FAILURE); - } - --settings_index; - write_settings(current_settings()); -} - static void restore_settings_atexit(void) { if (skip_settings_restore) return; printf("Restore THP and 
khugepaged settings..."); - write_settings(&saved_settings); + thp_restore_settings(); success("OK"); skip_settings_restore = true; @@ -395,27 +111,9 @@ static void restore_settings(int sig) static void save_settings(void) { printf("Save THP and khugepaged settings..."); - saved_settings = (struct settings) { - .thp_enabled = read_string("enabled", thp_enabled_strings), - .thp_defrag = read_string("defrag", thp_defrag_strings), - .shmem_enabled = - read_string("shmem_enabled", shmem_enabled_strings), - .use_zero_page = read_num("use_zero_page"), - }; - saved_settings.khugepaged = (struct khugepaged_settings) { - .defrag = read_num("khugepaged/defrag"), - .alloc_sleep_millisecs = - read_num("khugepaged/alloc_sleep_millisecs"), - .scan_sleep_millisecs = - read_num("khugepaged/scan_sleep_millisecs"), - .max_ptes_none = read_num("khugepaged/max_ptes_none"), - .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), - .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), - .pages_to_scan = read_num("khugepaged/pages_to_scan"), - }; if (file_ops && finfo.type == VMA_FILE) - saved_settings.read_ahead_kb = - _read_num(finfo.dev_queue_read_ahead_path); + thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path); + thp_save_settings(); success("OK"); @@ -798,7 +496,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect) { int ret; - struct settings settings = *current_settings(); + struct thp_settings settings = *thp_current_settings(); printf("%s...", msg); @@ -808,7 +506,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages, */ settings.thp_enabled = THP_NEVER; settings.shmem_enabled = SHMEM_NEVER; - push_settings(&settings); + thp_push_settings(&settings); /* Clear VM_NOHUGEPAGE */ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); @@ -820,7 +518,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages, else success("OK"); - pop_settings(); + thp_pop_settings(); } static 
void madvise_collapse(const char *msg, char *p, int nr_hpages, @@ -850,13 +548,13 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages, madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); /* Wait until the second full_scan completed */ - full_scans = read_num("khugepaged/full_scans") + 2; + full_scans = thp_read_num("khugepaged/full_scans") + 2; printf("%s...", msg); while (timeout--) { if (ops->check_huge(p, nr_hpages)) break; - if (read_num("khugepaged/full_scans") >= full_scans) + if (thp_read_num("khugepaged/full_scans") >= full_scans) break; printf("."); usleep(TICK); @@ -911,11 +609,11 @@ static bool is_tmpfs(struct mem_ops *ops) static void alloc_at_fault(void) { - struct settings settings = *current_settings(); + struct thp_settings settings = *thp_current_settings(); char *p; settings.thp_enabled = THP_ALWAYS; - push_settings(&settings); + thp_push_settings(&settings); p = alloc_mapping(1); *p = 1; @@ -925,7 +623,7 @@ static void alloc_at_fault(void) else fail("Fail"); - pop_settings(); + thp_pop_settings(); madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); @@ -973,11 +671,11 @@ static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_none = hpage_pmd_nr / 2; - struct settings settings = *current_settings(); + struct thp_settings settings = *thp_current_settings(); void *p; settings.khugepaged.max_ptes_none = max_ptes_none; - push_settings(&settings); + thp_push_settings(&settings); p = ops->setup_area(1); @@ -1002,7 +700,7 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o } skip: ops->cleanup_area(p, hpage_pmd_size); - pop_settings(); + thp_pop_settings(); } static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) @@ -1033,7 +731,7 @@ out: static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops 
*ops) { - int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); + int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap"); void *p; p = ops->setup_area(1); @@ -1250,11 +948,11 @@ static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *o fail("Fail"); ops->fault(p, 0, page_size); - write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); + thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); c->collapse("Collapse PTE table full of compound pages in child", p, 1, ops, true); - write_num("khugepaged/max_ptes_shared", - current_settings()->khugepaged.max_ptes_shared); + thp_write_num("khugepaged/max_ptes_shared", + thp_current_settings()->khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); @@ -1275,7 +973,7 @@ static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *o static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) { - int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); + int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared"); int wstatus; void *p; @@ -1443,7 +1141,7 @@ static void parse_test_type(int argc, const char **argv) int main(int argc, const char **argv) { - struct settings default_settings = { + struct thp_settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, .shmem_enabled = SHMEM_ADVISE, @@ -1484,7 +1182,7 @@ int main(int argc, const char **argv) default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; save_settings(); - push_settings(&default_settings); + thp_push_settings(&default_settings); alloc_at_fault(); diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c new file mode 100644 index 000000000000..5e8ec792cac7 --- /dev/null +++ b/tools/testing/selftests/mm/thp_settings.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <fcntl.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + 
+#include "thp_settings.h" + +#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" +#define MAX_SETTINGS_DEPTH 4 +static struct thp_settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; +static struct thp_settings saved_settings; +static char dev_queue_read_ahead_path[PATH_MAX]; + +static const char * const thp_enabled_strings[] = { + "always", + "madvise", + "never", + NULL +}; + +static const char * const thp_defrag_strings[] = { + "always", + "defer", + "defer+madvise", + "madvise", + "never", + NULL +}; + +static const char * const shmem_enabled_strings[] = { + "always", + "within_size", + "advise", + "never", + "deny", + "force", + NULL +}; + +int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); + return 0; + } + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); + return 0; + } + + return (unsigned int) numwritten; +} + +const unsigned long read_num(const char *path) +{ + char buf[21]; + + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file()"); + exit(EXIT_FAILURE); + } + + return strtoul(buf, NULL, 10); +} + +void write_num(const char *path, unsigned long num) +{ + char buf[21]; + + sprintf(buf, "%ld", num); + if (!write_file(path, buf, strlen(buf) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +int thp_read_string(const char *name, const char * const strings[]) +{ + char path[PATH_MAX]; + char buf[256]; + char *c; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", 
name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + + c = strchr(buf, '['); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + + c++; + memmove(buf, c, sizeof(buf) - (c - buf)); + + c = strchr(buf, ']'); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + *c = '\0'; + + ret = 0; + while (strings[ret]) { + if (!strcmp(strings[ret], buf)) + return ret; + ret++; + } + + printf("Failed to parse %s\n", name); + exit(EXIT_FAILURE); +} + +void thp_write_string(const char *name, const char *val) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(path, val, strlen(val) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +const unsigned long thp_read_num(const char *name) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return read_num(path); +} + +void thp_write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + write_num(path, num); +} + +void thp_read_settings(struct thp_settings *settings) +{ + *settings = (struct thp_settings) { + .thp_enabled = thp_read_string("enabled", thp_enabled_strings), + .thp_defrag = thp_read_string("defrag", thp_defrag_strings), + .shmem_enabled = + thp_read_string("shmem_enabled", shmem_enabled_strings), + .use_zero_page = thp_read_num("use_zero_page"), + }; + settings->khugepaged = (struct khugepaged_settings) { + .defrag = 
thp_read_num("khugepaged/defrag"), + .alloc_sleep_millisecs = + thp_read_num("khugepaged/alloc_sleep_millisecs"), + .scan_sleep_millisecs = + thp_read_num("khugepaged/scan_sleep_millisecs"), + .max_ptes_none = thp_read_num("khugepaged/max_ptes_none"), + .max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared"), + .pages_to_scan = thp_read_num("khugepaged/pages_to_scan"), + }; + if (dev_queue_read_ahead_path[0]) + settings->read_ahead_kb = read_num(dev_queue_read_ahead_path); +} + +void thp_write_settings(struct thp_settings *settings) +{ + struct khugepaged_settings *khugepaged = &settings->khugepaged; + + thp_write_string("enabled", thp_enabled_strings[settings->thp_enabled]); + thp_write_string("defrag", thp_defrag_strings[settings->thp_defrag]); + thp_write_string("shmem_enabled", + shmem_enabled_strings[settings->shmem_enabled]); + thp_write_num("use_zero_page", settings->use_zero_page); + + thp_write_num("khugepaged/defrag", khugepaged->defrag); + thp_write_num("khugepaged/alloc_sleep_millisecs", + khugepaged->alloc_sleep_millisecs); + thp_write_num("khugepaged/scan_sleep_millisecs", + khugepaged->scan_sleep_millisecs); + thp_write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); + thp_write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + thp_write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); + thp_write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (dev_queue_read_ahead_path[0]) + write_num(dev_queue_read_ahead_path, settings->read_ahead_kb); +} + +struct thp_settings *thp_current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +void thp_push_settings(struct thp_settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + 
settings_stack[settings_index++] = *settings; + thp_write_settings(thp_current_settings()); +} + +void thp_pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + thp_write_settings(thp_current_settings()); +} + +void thp_restore_settings(void) +{ + thp_write_settings(&saved_settings); +} + +void thp_save_settings(void) +{ + thp_read_settings(&saved_settings); +} + +void thp_set_read_ahead_path(char *path) +{ + if (!path) { + dev_queue_read_ahead_path[0] = '\0'; + return; + } + + strncpy(dev_queue_read_ahead_path, path, + sizeof(dev_queue_read_ahead_path)); + dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0'; +} diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h new file mode 100644 index 000000000000..ff3d98c30617 --- /dev/null +++ b/tools/testing/selftests/mm/thp_settings.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __THP_SETTINGS_H__ +#define __THP_SETTINGS_H__ + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +enum thp_enabled { + THP_ALWAYS, + THP_MADVISE, + THP_NEVER, +}; + +enum thp_defrag { + THP_DEFRAG_ALWAYS, + THP_DEFRAG_DEFER, + THP_DEFRAG_DEFER_MADVISE, + THP_DEFRAG_MADVISE, + THP_DEFRAG_NEVER, +}; + +enum shmem_enabled { + SHMEM_ALWAYS, + SHMEM_WITHIN_SIZE, + SHMEM_ADVISE, + SHMEM_NEVER, + SHMEM_DENY, + SHMEM_FORCE, +}; + +struct khugepaged_settings { + bool defrag; + unsigned int alloc_sleep_millisecs; + unsigned int scan_sleep_millisecs; + unsigned int max_ptes_none; + unsigned int max_ptes_swap; + unsigned int max_ptes_shared; + unsigned long pages_to_scan; +}; + +struct thp_settings { + enum thp_enabled thp_enabled; + enum thp_defrag thp_defrag; + enum shmem_enabled shmem_enabled; + bool use_zero_page; + struct khugepaged_settings khugepaged; + unsigned long read_ahead_kb; +}; + +int read_file(const char *path, char *buf, size_t buflen); +int write_file(const char *path, const char *buf, 
size_t buflen); +const unsigned long read_num(const char *path); +void write_num(const char *path, unsigned long num); + +int thp_read_string(const char *name, const char * const strings[]); +void thp_write_string(const char *name, const char *val); +const unsigned long thp_read_num(const char *name); +void thp_write_num(const char *name, unsigned long num); + +void thp_write_settings(struct thp_settings *settings); +void thp_read_settings(struct thp_settings *settings); +struct thp_settings *thp_current_settings(void); +void thp_push_settings(struct thp_settings *settings); +void thp_pop_settings(void); +void thp_restore_settings(void); +void thp_save_settings(void); + +void thp_set_read_ahead_path(char *path); + +#endif /* __THP_SETTINGS_H__ */ -- cgit v1.2.3 From 4f5070a5e40db2e9dbf5fff4ec678d6fbb338d5c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:08 +0000 Subject: selftests/mm: support multi-size THP interface in thp_settings Save and restore the new per-size hugepage enabled setting, if available on the running kernel. Since the number of per-size directories is not fixed, solve this as simply as possible by catering for a maximum number in the thp_settings struct (20). Each array index is the order. The value of THP_NEVER is changed to 0 so that all of these new settings default to THP_NEVER and the user only needs to fill in the ones they want to enable. Link: https://lkml.kernel.org/r/20231207161211.2374093-8-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/khugepaged.c | 3 ++ tools/testing/selftests/mm/thp_settings.c | 55 ++++++++++++++++++++++++++++++- tools/testing/selftests/mm/thp_settings.h | 11 ++++++- 3 files changed, 67 insertions(+), 2 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index b15e7fd70176..7bd3baa9d34b 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -1141,6 +1141,7 @@ static void parse_test_type(int argc, const char **argv) int main(int argc, const char **argv) { + int hpage_pmd_order; struct thp_settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, @@ -1175,11 +1176,13 @@ int main(int argc, const char **argv) exit(EXIT_FAILURE); } hpage_pmd_nr = hpage_pmd_size / page_size; + hpage_pmd_order = __builtin_ctz(hpage_pmd_nr); default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; + default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT; save_settings(); thp_push_settings(&default_settings); diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c index 5e8ec792cac7..a4163438108e 100644 --- a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/thp_settings.c @@ -16,9 +16,10 @@ static struct thp_settings saved_settings; static char dev_queue_read_ahead_path[PATH_MAX]; static const char * const thp_enabled_strings[] = { + "never", "always", + "inherit", "madvise", - "never", NULL }; @@ -198,6 +199,10 @@ void thp_write_num(const char *name, unsigned long num) void 
thp_read_settings(struct thp_settings *settings) { + unsigned long orders = thp_supported_orders(); + char path[PATH_MAX]; + int i; + *settings = (struct thp_settings) { .thp_enabled = thp_read_string("enabled", thp_enabled_strings), .thp_defrag = thp_read_string("defrag", thp_defrag_strings), @@ -218,11 +223,26 @@ void thp_read_settings(struct thp_settings *settings) }; if (dev_queue_read_ahead_path[0]) settings->read_ahead_kb = read_num(dev_queue_read_ahead_path); + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & orders)) { + settings->hugepages[i].enabled = THP_NEVER; + continue; + } + snprintf(path, PATH_MAX, "hugepages-%ukB/enabled", + (getpagesize() >> 10) << i); + settings->hugepages[i].enabled = + thp_read_string(path, thp_enabled_strings); + } } void thp_write_settings(struct thp_settings *settings) { struct khugepaged_settings *khugepaged = &settings->khugepaged; + unsigned long orders = thp_supported_orders(); + char path[PATH_MAX]; + int enabled; + int i; thp_write_string("enabled", thp_enabled_strings[settings->thp_enabled]); thp_write_string("defrag", thp_defrag_strings[settings->thp_defrag]); @@ -242,6 +262,15 @@ void thp_write_settings(struct thp_settings *settings) if (dev_queue_read_ahead_path[0]) write_num(dev_queue_read_ahead_path, settings->read_ahead_kb); + + for (i = 0; i < NR_ORDERS; i++) { + if (!((1 << i) & orders)) + continue; + snprintf(path, PATH_MAX, "hugepages-%ukB/enabled", + (getpagesize() >> 10) << i); + enabled = settings->hugepages[i].enabled; + thp_write_string(path, thp_enabled_strings[enabled]); + } } struct thp_settings *thp_current_settings(void) @@ -294,3 +323,27 @@ void thp_set_read_ahead_path(char *path) sizeof(dev_queue_read_ahead_path)); dev_queue_read_ahead_path[sizeof(dev_queue_read_ahead_path) - 1] = '\0'; } + +unsigned long thp_supported_orders(void) +{ + unsigned long orders = 0; + char path[PATH_MAX]; + char buf[256]; + int ret; + int i; + + for (i = 0; i < NR_ORDERS; i++) { + ret = snprintf(path, PATH_MAX, 
THP_SYSFS "hugepages-%ukB/enabled", + (getpagesize() >> 10) << i); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + ret = read_file(path, buf, sizeof(buf)); + if (ret) + orders |= 1UL << i; + } + + return orders; +} diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h index ff3d98c30617..71cbff05f4c7 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/thp_settings.h @@ -7,9 +7,10 @@ #include enum thp_enabled { + THP_NEVER, THP_ALWAYS, + THP_INHERIT, THP_MADVISE, - THP_NEVER, }; enum thp_defrag { @@ -29,6 +30,12 @@ enum shmem_enabled { SHMEM_FORCE, }; +#define NR_ORDERS 20 + +struct hugepages_settings { + enum thp_enabled enabled; +}; + struct khugepaged_settings { bool defrag; unsigned int alloc_sleep_millisecs; @@ -46,6 +53,7 @@ struct thp_settings { bool use_zero_page; struct khugepaged_settings khugepaged; unsigned long read_ahead_kb; + struct hugepages_settings hugepages[NR_ORDERS]; }; int read_file(const char *path, char *buf, size_t buflen); @@ -67,5 +75,6 @@ void thp_restore_settings(void); void thp_save_settings(void); void thp_set_read_ahead_path(char *path); +unsigned long thp_supported_orders(void); #endif /* __THP_SETTINGS_H__ */ -- cgit v1.2.3 From 9f0704eae8a4edc8dca9c8a297f798d505a4103a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:09 +0000 Subject: selftests/mm/khugepaged: enlighten for multi-size THP The `collapse_max_ptes_none` test was previously failing when a THP size less than PMD-size had enabled="always". The root cause is because the test faults in 1 page less than the threshold it set for collapsing. But when THP is enabled always, we "over allocate" and therefore the threshold is passed, and collapse unexpectedly succeeds. Solve this by enlightening khugepaged selftest. Add a command line option to pass in the desired THP size that should be used for all anonymous allocations. 
The harness will then explicitly configure a THP size as requested and modify the `collapse_max_ptes_none` test so that it faults in the threshold minus the number of pages in the configured THP size. If no command line option is provided, default to order 0, as per previous behaviour. I chose to use an order in the command line interface, since this makes the interface agnostic of base page size, making it easier to invoke from run_vmtests.sh. Link: https://lkml.kernel.org/r/20231207161211.2374093-9-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/khugepaged.c | 48 ++++++++++++++++++++++++------- tools/testing/selftests/mm/run_vmtests.sh | 2 ++ 2 files changed, 39 insertions(+), 11 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 7bd3baa9d34b..829320a519e7 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -28,6 +28,7 @@ static unsigned long hpage_pmd_size; static unsigned long page_size; static int hpage_pmd_nr; +static int anon_order; #define PID_SMAPS "/proc/self/smaps" #define TEST_FILE "collapse_test_file" @@ -607,6 +608,11 @@ static bool is_tmpfs(struct mem_ops *ops) return ops == &__file_ops && finfo.type == VMA_SHMEM; } +static bool is_anon(struct mem_ops *ops) +{ + return ops == &__anon_ops; +} + static void alloc_at_fault(void) { struct thp_settings settings = *thp_current_settings(); @@ -673,6 +679,7 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o int max_ptes_none = 
hpage_pmd_nr / 2; struct thp_settings settings = *thp_current_settings(); void *p; + int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1; settings.khugepaged.max_ptes_none = max_ptes_none; thp_push_settings(&settings); @@ -686,10 +693,10 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o goto skip; } - ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, ops, !c->enforce_pte_scan_limits); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); if (c->enforce_pte_scan_limits) { ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); @@ -1076,7 +1083,7 @@ static void madvise_retracted_page_tables(struct collapse_context *c, static void usage(void) { - fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); + fprintf(stderr, "\nUsage: ./khugepaged [OPTIONS] [dir]\n\n"); fprintf(stderr, "\t\t: :\n"); fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); fprintf(stderr, "\t\t: [all|anon|file|shmem]\n"); @@ -1085,15 +1092,34 @@ static void usage(void) fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + fprintf(stderr, "\n\tSupported Options:\n"); + fprintf(stderr, "\t\t-h: This help message.\n"); + fprintf(stderr, "\t\t-s: mTHP size, expressed as page order.\n"); + fprintf(stderr, "\t\t Defaults to 0. 
Use this size for anon allocations.\n"); exit(1); } -static void parse_test_type(int argc, const char **argv) +static void parse_test_type(int argc, char **argv) { + int opt; char *buf; const char *token; - if (argc == 1) { + while ((opt = getopt(argc, argv, "s:h")) != -1) { + switch (opt) { + case 's': + anon_order = atoi(optarg); + break; + case 'h': + default: + usage(); + } + } + + argv += optind; + argc -= optind; + + if (argc == 0) { /* Backwards compatibility */ khugepaged_context = &__khugepaged_context; madvise_context = &__madvise_context; @@ -1101,7 +1127,7 @@ static void parse_test_type(int argc, const char **argv) return; } - buf = strdup(argv[1]); + buf = strdup(argv[0]); token = strsep(&buf, ":"); if (!strcmp(token, "all")) { @@ -1135,11 +1161,13 @@ static void parse_test_type(int argc, const char **argv) if (!file_ops) return; - if (argc != 3) + if (argc != 2) usage(); + + get_finfo(argv[1]); } -int main(int argc, const char **argv) +int main(int argc, char **argv) { int hpage_pmd_order; struct thp_settings default_settings = { @@ -1164,9 +1192,6 @@ int main(int argc, const char **argv) parse_test_type(argc, argv); - if (file_ops) - get_finfo(argv[2]); - setbuf(stdout, NULL); page_size = getpagesize(); @@ -1183,6 +1208,7 @@ int main(int argc, const char **argv) default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT; + default_settings.hugepages[anon_order].enabled = THP_ALWAYS; save_settings(); thp_push_settings(&default_settings); diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index c0212258b852..87f513f5cf91 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -357,6 +357,8 @@ CATEGORY="cow" run_test ./cow CATEGORY="thp" run_test ./khugepaged +CATEGORY="thp" run_test ./khugepaged -s 2 + CATEGORY="thp" run_test 
./transhuge-stress -d 20 CATEGORY="thp" run_test ./split_huge_page_test -- cgit v1.2.3 From 12dc16b38463a671bc91dc2df10f3a014a27ff3b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:10 +0000 Subject: selftests/mm/cow: generalize do_run_with_thp() helper do_run_with_thp() prepares (PMD-sized) THP memory into different states before running tests. With the introduction of multi-size THP, we would like to reuse this logic to also test those smaller THP sizes. So let's add a thpsize parameter which tells the function what size THP it should operate on. A separate commit will utilize this change to add new tests for multi-size THP, where available. Link: https://lkml.kernel.org/r/20231207161211.2374093-10-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 121 ++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 54 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 6f2f83990441..a284918b1172 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -32,7 +32,7 @@ static size_t pagesize; static int pagemap_fd; -static size_t thpsize; +static size_t pmdsize; static int nr_hugetlbsizes; static size_t hugetlbsizes[10]; static int gup_fd; @@ -734,7 +734,7 @@ enum thp_run { THP_RUN_PARTIAL_SHARED, }; -static void do_run_with_thp(test_fn fn, enum thp_run thp_run) +static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) { char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED; size_t size, mmap_size, mremap_size; @@ -759,11 +759,11 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run) } /* - * Try to populate a THP. Touch the first sub-page and test if we get - * another sub-page populated automatically. + * Try to populate a THP. Touch the first sub-page and test if + * we get the last sub-page populated automatically. */ mem[0] = 0; - if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) { + if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { ksft_test_result_skip("Did not get a THP populated\n"); goto munmap; } @@ -773,12 +773,14 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run) switch (thp_run) { case THP_RUN_PMD: case THP_RUN_PMD_SWAPOUT: + assert(thpsize == pmdsize); break; case THP_RUN_PTE: case THP_RUN_PTE_SWAPOUT: /* * Trigger PTE-mapping the THP by temporarily mapping a single - * subpage R/O. + * subpage R/O. This is a noop if the THP is not pmdsize (and + * therefore already PTE-mapped). 
*/ ret = mprotect(mem + pagesize, pagesize, PROT_READ); if (ret) { @@ -875,52 +877,60 @@ munmap: munmap(mremap_mem, mremap_size); } -static void run_with_thp(test_fn fn, const char *desc) +static void run_with_thp(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with THP\n", desc); - do_run_with_thp(fn, THP_RUN_PMD); + ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PMD, size); } -static void run_with_thp_swap(test_fn fn, const char *desc) +static void run_with_thp_swap(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc); - do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT); + ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size); } -static void run_with_pte_mapped_thp(test_fn fn, const char *desc) +static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc); - do_run_with_thp(fn, THP_RUN_PTE); + ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PTE, size); } -static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc) +static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc); - do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT); + ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size); } -static void run_with_single_pte_of_thp(test_fn fn, const char *desc) +static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc); - do_run_with_thp(fn, THP_RUN_SINGLE_PTE); + ksft_print_msg("[RUN] %s ... 
with single PTE of THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size); } -static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc) +static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc); - do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT); + ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size); } -static void run_with_partial_mremap_thp(test_fn fn, const char *desc) +static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc); - do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP); + ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size); } -static void run_with_partial_shared_thp(test_fn fn, const char *desc) +static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size) { - ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc); - do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED); + ksft_print_msg("[RUN] %s ... 
with partially shared THP (%zu kB)\n", + desc, size / 1024); + do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size); } static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) @@ -1091,15 +1101,15 @@ static void run_anon_test_case(struct test_case const *test_case) run_with_base_page(test_case->fn, test_case->desc); run_with_base_page_swap(test_case->fn, test_case->desc); - if (thpsize) { - run_with_thp(test_case->fn, test_case->desc); - run_with_thp_swap(test_case->fn, test_case->desc); - run_with_pte_mapped_thp(test_case->fn, test_case->desc); - run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc); - run_with_single_pte_of_thp(test_case->fn, test_case->desc); - run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc); - run_with_partial_mremap_thp(test_case->fn, test_case->desc); - run_with_partial_shared_thp(test_case->fn, test_case->desc); + if (pmdsize) { + run_with_thp(test_case->fn, test_case->desc, pmdsize); + run_with_thp_swap(test_case->fn, test_case->desc, pmdsize); + run_with_pte_mapped_thp(test_case->fn, test_case->desc, pmdsize); + run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, pmdsize); + run_with_single_pte_of_thp(test_case->fn, test_case->desc, pmdsize); + run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, pmdsize); + run_with_partial_mremap_thp(test_case->fn, test_case->desc, pmdsize); + run_with_partial_shared_thp(test_case->fn, test_case->desc, pmdsize); } for (i = 0; i < nr_hugetlbsizes; i++) run_with_hugetlb(test_case->fn, test_case->desc, @@ -1120,7 +1130,7 @@ static int tests_per_anon_test_case(void) { int tests = 2 + nr_hugetlbsizes; - if (thpsize) + if (pmdsize) tests += 8; return tests; } @@ -1329,7 +1339,7 @@ static void run_anon_thp_test_cases(void) { int i; - if (!thpsize) + if (!pmdsize) return; ksft_print_msg("[INFO] Anonymous THP tests\n"); @@ -1338,13 +1348,13 @@ static void run_anon_thp_test_cases(void) struct test_case const *test_case = &anon_thp_test_cases[i]; 
ksft_print_msg("[RUN] %s\n", test_case->desc); - do_run_with_thp(test_case->fn, THP_RUN_PMD); + do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize); } } static int tests_per_anon_thp_test_case(void) { - return thpsize ? 1 : 0; + return pmdsize ? 1 : 0; } typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); @@ -1419,7 +1429,7 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) } /* For alignment purposes, we need twice the thp size. */ - mmap_size = 2 * thpsize; + mmap_size = 2 * pmdsize; mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (mmap_mem == MAP_FAILED) { @@ -1434,11 +1444,11 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) } /* We need a THP-aligned memory area. */ - mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); - smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1)); + mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1)); + smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1)); - ret = madvise(mem, thpsize, MADV_HUGEPAGE); - ret |= madvise(smem, thpsize, MADV_HUGEPAGE); + ret = madvise(mem, pmdsize, MADV_HUGEPAGE); + ret |= madvise(smem, pmdsize, MADV_HUGEPAGE); if (ret) { ksft_test_result_fail("MADV_HUGEPAGE failed\n"); goto munmap; @@ -1457,7 +1467,7 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) goto munmap; } - fn(mem, smem, thpsize); + fn(mem, smem, pmdsize); munmap: munmap(mmap_mem, mmap_size); if (mmap_smem != MAP_FAILED) @@ -1650,7 +1660,7 @@ static void run_non_anon_test_case(struct non_anon_test_case const *test_case) run_with_zeropage(test_case->fn, test_case->desc); run_with_memfd(test_case->fn, test_case->desc); run_with_tmpfile(test_case->fn, test_case->desc); - if (thpsize) + if (pmdsize) run_with_huge_zeropage(test_case->fn, test_case->desc); for (i = 0; i < nr_hugetlbsizes; i++) run_with_memfd_hugetlb(test_case->fn, 
test_case->desc, @@ -1671,7 +1681,7 @@ static int tests_per_non_anon_test_case(void) { int tests = 3 + nr_hugetlbsizes; - if (thpsize) + if (pmdsize) tests += 1; return tests; } @@ -1683,10 +1693,13 @@ int main(int argc, char **argv) ksft_print_header(); pagesize = getpagesize(); - thpsize = read_pmd_pagesize(); - if (thpsize) + pmdsize = read_pmd_pagesize(); + if (pmdsize) { + ksft_print_msg("[INFO] detected PMD size: %zu KiB\n", + pmdsize / 1024); ksft_print_msg("[INFO] detected THP size: %zu KiB\n", - thpsize / 1024); + pmdsize / 1024); + } nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, ARRAY_SIZE(hugetlbsizes)); detect_huge_zeropage(); -- cgit v1.2.3 From c0f79103322c322ea9342d52c2d81528b7b56232 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:11 +0000 Subject: selftests/mm/cow: add tests for anonymous multi-size THP Add tests similar to the existing PMD-sized THP tests, but which operate on memory backed by (PTE-mapped) multi-size THP. This reuses all the existing infrastructure. If the test suite detects that multi-size THP is not supported by the kernel, the new tests are skipped. Link: https://lkml.kernel.org/r/20231207161211.2374093-11-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Tested-by: Kefeng Wang Tested-by: John Hubbard Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 82 ++++++++++++++++++++++++++++++++++------ 1 file changed, 70 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index a284918b1172..363bf5f801be 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -29,15 +29,49 @@ #include "../../../../mm/gup_test.h" #include "../kselftest.h" #include "vm_util.h" +#include "thp_settings.h" static size_t pagesize; static int pagemap_fd; static size_t pmdsize; +static int nr_thpsizes; +static size_t thpsizes[20]; static int nr_hugetlbsizes; static size_t hugetlbsizes[10]; static int gup_fd; static bool has_huge_zeropage; +static int sz2ord(size_t size) +{ + return __builtin_ctzll(size / pagesize); +} + +static int detect_thp_sizes(size_t sizes[], int max) +{ + int count = 0; + unsigned long orders; + size_t kb; + int i; + + /* thp not supported at all. 
*/ + if (!pmdsize) + return 0; + + orders = 1UL << sz2ord(pmdsize); + orders |= thp_supported_orders(); + + for (i = 0; orders && count < max; i++) { + if (!(orders & (1UL << i))) + continue; + orders &= ~(1UL << i); + kb = (pagesize >> 10) << i; + sizes[count++] = kb * 1024; + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb); + } + + return count; +} + static void detect_huge_zeropage(void) { int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", @@ -1101,15 +1135,27 @@ static void run_anon_test_case(struct test_case const *test_case) run_with_base_page(test_case->fn, test_case->desc); run_with_base_page_swap(test_case->fn, test_case->desc); - if (pmdsize) { - run_with_thp(test_case->fn, test_case->desc, pmdsize); - run_with_thp_swap(test_case->fn, test_case->desc, pmdsize); - run_with_pte_mapped_thp(test_case->fn, test_case->desc, pmdsize); - run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, pmdsize); - run_with_single_pte_of_thp(test_case->fn, test_case->desc, pmdsize); - run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, pmdsize); - run_with_partial_mremap_thp(test_case->fn, test_case->desc, pmdsize); - run_with_partial_shared_thp(test_case->fn, test_case->desc, pmdsize); + for (i = 0; i < nr_thpsizes; i++) { + size_t size = thpsizes[i]; + struct thp_settings settings = *thp_current_settings(); + + settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER; + settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS; + thp_push_settings(&settings); + + if (size == pmdsize) { + run_with_thp(test_case->fn, test_case->desc, size); + run_with_thp_swap(test_case->fn, test_case->desc, size); + } + + run_with_pte_mapped_thp(test_case->fn, test_case->desc, size); + run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc, size); + run_with_single_pte_of_thp(test_case->fn, test_case->desc, size); + run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc, size); + run_with_partial_mremap_thp(test_case->fn, test_case->desc, 
size); + run_with_partial_shared_thp(test_case->fn, test_case->desc, size); + + thp_pop_settings(); } for (i = 0; i < nr_hugetlbsizes; i++) run_with_hugetlb(test_case->fn, test_case->desc, @@ -1130,8 +1176,9 @@ static int tests_per_anon_test_case(void) { int tests = 2 + nr_hugetlbsizes; + tests += 6 * nr_thpsizes; if (pmdsize) - tests += 8; + tests += 2; return tests; } @@ -1689,16 +1736,22 @@ static int tests_per_non_anon_test_case(void) int main(int argc, char **argv) { int err; + struct thp_settings default_settings; ksft_print_header(); pagesize = getpagesize(); pmdsize = read_pmd_pagesize(); if (pmdsize) { + /* Only if THP is supported. */ + thp_read_settings(&default_settings); + default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT; + thp_save_settings(); + thp_push_settings(&default_settings); + ksft_print_msg("[INFO] detected PMD size: %zu KiB\n", pmdsize / 1024); - ksft_print_msg("[INFO] detected THP size: %zu KiB\n", - pmdsize / 1024); + nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes)); } nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, ARRAY_SIZE(hugetlbsizes)); @@ -1717,6 +1770,11 @@ int main(int argc, char **argv) run_anon_thp_test_cases(); run_non_anon_test_cases(); + if (pmdsize) { + /* Only if THP is supported. */ + thp_restore_settings(); + } + err = ksft_get_fail_cnt(); if (err) ksft_exit_fail_msg("%d out of %d tests failed\n", -- cgit v1.2.3 From 03d69d49da496e31246f41a017b32b68b9d2362e Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 8 Dec 2023 10:04:50 +0800 Subject: maple_tree: fix warning comparing pointer to 0 Avoid pointer type value compared with 0 to make code clear. ./tools/testing/radix-tree/maple.c:34142:15-16: WARNING comparing pointer to 0. Link: https://lkml.kernel.org/r/20231208020450.7003-1-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7696 Signed-off-by: Jiapeng Chong Cc: Liam R. 
Howlett Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools') diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 35cc8c2a10f4..f1caf4bcf937 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -34139,7 +34139,7 @@ STORE, 140501948112896, 140501948116991, mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); check_erase2_testset(mt, set27, ARRAY_SIZE(set27)); rcu_barrier(); - MT_BUG_ON(mt, 0 != mtree_load(mt, 140415537422336)); + MT_BUG_ON(mt, NULL != mtree_load(mt, 140415537422336)); mt_set_non_kernel(0); mt_validate(mt); mtree_destroy(mt); @@ -34263,7 +34263,7 @@ STORE, 140501948112896, 140501948116991, mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); check_erase2_testset(mt, set37, ARRAY_SIZE(set37)); rcu_barrier(); - MT_BUG_ON(mt, 0 != mtree_load(mt, 94637033459712)); + MT_BUG_ON(mt, NULL != mtree_load(mt, 94637033459712)); mt_validate(mt); mtree_destroy(mt); @@ -34271,7 +34271,7 @@ STORE, 140501948112896, 140501948116991, mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); check_erase2_testset(mt, set38, ARRAY_SIZE(set38)); rcu_barrier(); - MT_BUG_ON(mt, 0 != mtree_load(mt, 94637033459712)); + MT_BUG_ON(mt, NULL != mtree_load(mt, 94637033459712)); mt_validate(mt); mtree_destroy(mt); -- cgit v1.2.3 From 306abb63a8cab566bf80860c5430b1fa316646b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Dec 2023 19:48:06 +0000 Subject: selftests/damon: implement a python module for test-purpose DAMON sysfs controls Patch series "selftests/damon: add Python-written DAMON functionality tests", v2. DAMON exports most of its functionality via its sysfs interface. Hence most DAMON functionality tests could be implemented using the interface. However, because the interfaces require simple but multiple operations for many controls, writing all such tests from the scratch could be repetitive and time consuming. 
Implement a minimal DAMON sysfs control module, and a couple of DAMON functionality tests using the control module. The first test is for ensuring minimum accuracy of data access monitoring, and the second test is for checking whether a previously found and fixed bug has been introduced again. Note that the DAMON sysfs control module is only for avoiding duplicating code in tests. For convenient and general control of DAMON, users should use DAMON user-space tools that are developed for the purpose, such as damo[1]. [1] https://github.com/damonitor/damo Patches Sequence ---------------- This patchset is constructed with five patches. The first three patches implement a DAMON sysfs control module, written in Python, for the purpose of implementing tests. The implementation is done incrementally, in the sequence of the basic data structure (first patch) first, the kdamonds start command (second patch) next, and finally the DAMOS tried bytes update command (third patch). Then two patches implementing selftests using the module follow. The fourth patch implements a basic functionality test of DAMON for working set estimation accuracy. Finally, the fifth patch implements a corner case test for a previously found bug. This patch (of 5): Implement a Python module for DAMON sysfs controls. The module is aimed to be useful for writing DAMON functionality tests in the future. Nonetheless, this module represents only a subset of DAMON sysfs files. Following commits will implement more DAMON sysfs controls.
Link: https://lkml.kernel.org/r/20231212194810.54457-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231212194810.54457-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 102 ++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 tools/testing/selftests/damon/_damon_sysfs.py (limited to 'tools') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py new file mode 100644 index 000000000000..78101846ab66 --- /dev/null +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: GPL-2.0 + +class DamosAccessPattern: + size = None + nr_accesses = None + age = None + scheme = None + + def __init__(self, size=None, nr_accesses=None, age=None): + self.size = size + self.nr_accesses = nr_accesses + self.age = age + + if self.size == None: + self.size = [0, 2**64 - 1] + if self.nr_accesses == None: + self.nr_accesses = [0, 2**64 - 1] + if self.age == None: + self.age = [0, 2**64 - 1] + +class Damos: + action = None + access_pattern = None + # todo: Support quotas, watermarks, stats, tried_regions + idx = None + context = None + + def __init__(self, action='stat', access_pattern=DamosAccessPattern()): + self.action = action + self.access_pattern = access_pattern + self.access_pattern.scheme = self + +class DamonTarget: + pid = None + # todo: Support target regions if test is made + idx = None + context = None + + def __init__(self, pid): + self.pid = pid + +class DamonAttrs: + sample_us = None + aggr_us = None + update_us = None + min_nr_regions = None + max_nr_regions = None + context = None + + def __init__(self, sample_us=5000, aggr_us=100000, update_us=1000000, + min_nr_regions=10, max_nr_regions=1000): + self.sample_us = sample_us + self.aggr_us = aggr_us + self.update_us = update_us + self.min_nr_regions = min_nr_regions + self.max_nr_regions = max_nr_regions + +class 
DamonCtx: + ops = None + monitoring_attrs = None + targets = None + schemes = None + kdamond = None + idx = None + + def __init__(self, ops='paddr', monitoring_attrs=DamonAttrs(), targets=[], + schemes=[]): + self.ops = ops + self.monitoring_attrs = monitoring_attrs + self.monitoring_attrs.context = self + + self.targets = targets + for idx, target in enumerate(self.targets): + target.idx = idx + target.context = self + + self.schemes = schemes + for idx, scheme in enumerate(self.schemes): + scheme.idx = idx + scheme.context = self + +class Kdamond: + state = None + pid = None + contexts = None + idx = None # index of this kdamond between siblings + kdamonds = None # parent + + def __init__(self, contexts=[]): + self.contexts = contexts + for idx, context in enumerate(self.contexts): + context.idx = idx + context.kdamond = self + +class Kdamonds: + kdamonds = [] + + def __init__(self, kdamonds=[]): + self.kdamonds = kdamonds + for idx, kdamond in enumerate(self.kdamonds): + kdamond.idx = idx + kdamond.kdamonds = self -- cgit v1.2.3 From f5f0e5a2bef9e46f7a674b71d7f2a4c4b7e6bc5d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Dec 2023 19:48:07 +0000 Subject: selftests/damon/_damon_sysfs: implement kdamonds start function Extend the tests-writing-purpose DAMON sysfs control module to support the kdamonds start functionality. 
Link: https://lkml.kernel.org/r/20231212194810.54457-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 206 ++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 78101846ab66..6b99f87a5f1e 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -1,5 +1,28 @@ # SPDX-License-Identifier: GPL-2.0 +import os + +sysfs_root = '/sys/kernel/mm/damon/admin' + +def write_file(path, string): + "Returns error string if failed, or None otherwise" + string = '%s' % string + try: + with open(path, 'w') as f: + f.write(string) + except Exception as e: + return '%s' % e + return None + +def read_file(path): + '''Returns the read content and error string. The read content is None if + the reading failed''' + try: + with open(path, 'r') as f: + return f.read(), None + except Exception as e: + return None, '%s' % e + class DamosAccessPattern: size = None nr_accesses = None @@ -18,6 +41,35 @@ class DamosAccessPattern: if self.age == None: self.age = [0, 2**64 - 1] + def sysfs_dir(self): + return os.path.join(self.scheme.sysfs_dir(), 'access_pattern') + + def stage(self): + err = write_file( + os.path.join(self.sysfs_dir(), 'sz', 'min'), self.size[0]) + if err != None: + return err + err = write_file( + os.path.join(self.sysfs_dir(), 'sz', 'max'), self.size[1]) + if err != None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'min'), + self.nr_accesses[0]) + if err != None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'nr_accesses', 'max'), + self.nr_accesses[1]) + if err != None: + return err + err = write_file( + os.path.join(self.sysfs_dir(), 'age', 'min'), self.age[0]) + if err != None: + return err + err = write_file( + os.path.join(self.sysfs_dir(), 
'age', 'max'), self.age[1]) + if err != None: + return err + class Damos: action = None access_pattern = None @@ -30,6 +82,39 @@ class Damos: self.access_pattern = access_pattern self.access_pattern.scheme = self + def sysfs_dir(self): + return os.path.join( + self.context.sysfs_dir(), 'schemes', '%d' % self.idx) + + def stage(self): + err = write_file(os.path.join(self.sysfs_dir(), 'action'), self.action) + if err != None: + return err + err = self.access_pattern.stage() + if err != None: + return err + + # disable quotas + err = write_file(os.path.join(self.sysfs_dir(), 'quotas', 'ms'), '0') + if err != None: + return err + err = write_file( + os.path.join(self.sysfs_dir(), 'quotas', 'bytes'), '0') + if err != None: + return err + + # disable watermarks + err = write_file( + os.path.join(self.sysfs_dir(), 'watermarks', 'metric'), 'none') + if err != None: + return err + + # disable filters + err = write_file( + os.path.join(self.sysfs_dir(), 'filters', 'nr_filters'), '0') + if err != None: + return err + class DamonTarget: pid = None # todo: Support target regions if test is made @@ -39,6 +124,18 @@ class DamonTarget: def __init__(self, pid): self.pid = pid + def sysfs_dir(self): + return os.path.join( + self.context.sysfs_dir(), 'targets', '%d' % self.idx) + + def stage(self): + err = write_file( + os.path.join(self.sysfs_dir(), 'regions', 'nr_regions'), '0') + if err != None: + return err + return write_file( + os.path.join(self.sysfs_dir(), 'pid_target'), self.pid) + class DamonAttrs: sample_us = None aggr_us = None @@ -55,6 +152,40 @@ class DamonAttrs: self.min_nr_regions = min_nr_regions self.max_nr_regions = max_nr_regions + def interval_sysfs_dir(self): + return os.path.join(self.context.sysfs_dir(), 'monitoring_attrs', + 'intervals') + + def nr_regions_range_sysfs_dir(self): + return os.path.join(self.context.sysfs_dir(), 'monitoring_attrs', + 'nr_regions') + + def stage(self): + err = write_file(os.path.join(self.interval_sysfs_dir(), 'sample_us'), + 
self.sample_us) + if err != None: + return err + err = write_file(os.path.join(self.interval_sysfs_dir(), 'aggr_us'), + self.aggr_us) + if err != None: + return err + err = write_file(os.path.join(self.interval_sysfs_dir(), 'update_us'), + self.update_us) + if err != None: + return err + + err = write_file( + os.path.join(self.nr_regions_range_sysfs_dir(), 'min'), + self.min_nr_regions) + if err != None: + return err + + err = write_file( + os.path.join(self.nr_regions_range_sysfs_dir(), 'max'), + self.max_nr_regions) + if err != None: + return err + class DamonCtx: ops = None monitoring_attrs = None @@ -79,6 +210,46 @@ class DamonCtx: scheme.idx = idx scheme.context = self + def sysfs_dir(self): + return os.path.join(self.kdamond.sysfs_dir(), 'contexts', + '%d' % self.idx) + + def stage(self): + err = write_file( + os.path.join(self.sysfs_dir(), 'operations'), self.ops) + if err != None: + return err + err = self.monitoring_attrs.stage() + if err != None: + return err + + nr_targets_file = os.path.join( + self.sysfs_dir(), 'targets', 'nr_targets') + content, err = read_file(nr_targets_file) + if err != None: + return err + if int(content) != len(self.targets): + err = write_file(nr_targets_file, '%d' % len(self.targets)) + if err != None: + return err + for target in self.targets: + err = target.stage() + if err != None: + return err + + nr_schemes_file = os.path.join( + self.sysfs_dir(), 'schemes', 'nr_schemes') + content, err = read_file(nr_schemes_file) + if int(content) != len(self.schemes): + err = write_file(nr_schemes_file, '%d' % len(self.schemes)) + if err != None: + return err + for scheme in self.schemes: + err = scheme.stage() + if err != None: + return err + return None + class Kdamond: state = None pid = None @@ -92,6 +263,27 @@ class Kdamond: context.idx = idx context.kdamond = self + def sysfs_dir(self): + return os.path.join(self.kdamonds.sysfs_dir(), '%d' % self.idx) + + def start(self): + nr_contexts_file = os.path.join(self.sysfs_dir(), + 
'contexts', 'nr_contexts') + content, err = read_file(nr_contexts_file) + if err != None: + return err + if int(content) != len(self.contexts): + err = write_file(nr_contexts_file, '%d' % len(self.contexts)) + if err != None: + return err + + for context in self.contexts: + err = context.stage() + if err != None: + return err + err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on') + return err + class Kdamonds: kdamonds = [] @@ -100,3 +292,17 @@ class Kdamonds: for idx, kdamond in enumerate(self.kdamonds): kdamond.idx = idx kdamond.kdamonds = self + + def sysfs_dir(self): + return os.path.join(sysfs_root, 'kdamonds') + + def start(self): + err = write_file(os.path.join(self.sysfs_dir(), 'nr_kdamonds'), + '%s' % len(self.kdamonds)) + if err != None: + return err + for kdamond in self.kdamonds: + err = kdamond.start() + if err != None: + return err + return None -- cgit v1.2.3 From 3402c6ce398e33bf1733f619756dd068ca2e2aa5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Dec 2023 19:48:08 +0000 Subject: selftests/damon/_damon_sysfs: implement updat_schemes_tried_bytes command Implement update_schemes_tried_bytes command of DAMON sysfs interface in _damon_sysfs.py. It is not only making the update, but also read the updated value from the sysfs interface and store it in the Kdamond python objects so that the user of the module can easily get the value. 
Link: https://lkml.kernel.org/r/20231212194810.54457-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools') diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 6b99f87a5f1e..e98cf4b6a4b7 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -76,6 +76,7 @@ class Damos: # todo: Support quotas, watermarks, stats, tried_regions idx = None context = None + tried_bytes = None def __init__(self, action='stat', access_pattern=DamosAccessPattern()): self.action = action @@ -284,6 +285,19 @@ class Kdamond: err = write_file(os.path.join(self.sysfs_dir(), 'state'), 'on') return err + def update_schemes_tried_bytes(self): + err = write_file(os.path.join(self.sysfs_dir(), 'state'), + 'update_schemes_tried_bytes') + if err != None: + return err + for context in self.contexts: + for scheme in context.schemes: + content, err = read_file(os.path.join(scheme.sysfs_dir(), + 'tried_regions', 'total_bytes')) + if err != None: + return err + scheme.tried_bytes = int(content) + class Kdamonds: kdamonds = [] -- cgit v1.2.3 From b5906f5f7359f561c5915dc146ced1bc2733401c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 12 Dec 2023 19:48:09 +0000 Subject: selftests/damon: add a test for update_schemes_tried_regions sysfs command Add a selftest for verifying the accuracy of DAMON's access monitoring functionality. The test starts a program of artificial access pattern, monitor the access pattern using DAMON, and check if DAMON finds expected amount of hot data region (working set size) with only acceptable error rate. Note that the acceptable error rate is set with only naive assumptions and small number of tests. Hence failures of the test may not always mean DAMON is broken. 
Rather than that, those could be a signal to better understand the real accuracy level of DAMON in wider environments. Based on further finding, we could optimize DAMON or adjust the expectation of the test. Link: https://lkml.kernel.org/r/20231212194810.54457-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 + tools/testing/selftests/damon/access_memory.c | 41 ++++++++++++++++ ..._update_schemes_tried_regions_wss_estimation.py | 55 ++++++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 tools/testing/selftests/damon/access_memory.c create mode 100644 tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py (limited to 'tools') diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index b71247ba7196..90ffafc42c5e 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -2,6 +2,7 @@ # Makefile for damon selftests TEST_GEN_FILES += huge_count_read_write +TEST_GEN_FILES += access_memory TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh @@ -9,6 +10,7 @@ TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh +TEST_PROGS += sysfs_update_schemes_tried_regions_wss_estimation.py TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/access_memory.c b/tools/testing/selftests/damon/access_memory.c new file mode 100644 index 000000000000..585a2fa54329 --- /dev/null +++ b/tools/testing/selftests/damon/access_memory.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Artificial memory access program for testing DAMON. 
+ */ + +#include +#include +#include +#include + +int main(int argc, char *argv[]) +{ + char **regions; + clock_t start_clock; + int nr_regions; + int sz_region; + int access_time_ms; + int i; + + if (argc != 4) { + printf("Usage: %s