From 39d0bd86c499ecd6abae42a9b7112056c5560691 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 27 Feb 2023 09:36:00 -0800 Subject: maple_tree: be more cautious about dead nodes Patch series "Fix VMA tree modification under mmap read lock". Syzbot reported a BUG_ON in mm/mmap.c which was found to be caused by an inconsistency between threads walking the VMA maple tree. The inconsistency is caused by the page fault handler modifying the maple tree while holding the mmap_lock for read. This only happens for stack VMAs. We had thought this was safe as it only modifies a single pivot in the tree. Unfortunately, syzbot constructed a test case where the stack had no guard page and grew the stack to abut the next VMA. This causes us to delete the NULL entry between the two VMAs and rewrite the node. We considered several options for fixing this, including dropping the mmap_lock, then reacquiring it for write; and relaxing the definition of the tree to permit a zero-length NULL entry in the node. We decided the best option was to backport some of the RCU patches from -next, which solve the problem by allocating a new node and RCU-freeing the old node. Since the problem exists in 6.1, we preferred a solution which is similar to the one we intended to merge next merge window. These patches have been in -next since next-20230301, and have received intensive testing in Android as part of the RCU page fault patchset. They were also sent as part of the "Per-VMA locks" v4 patch series. Patches 1 to 7 are bug fixes for RCU mode of the tree and patch 8 enables RCU mode for the tree. Performance v6.3-rc3 vs patched v6.3-rc3: Running these changes through mmtests showed there was a 15-20% performance decrease in will-it-scale/brk1-processes. This tests creating and inserting a single VMA repeatedly through the brk interface and isn't representative of any real world applications. This patch (of 8): ma_pivots() and ma_data_end() may be called with a dead node. Ensure to that the node isn't dead before using the returned values. This is necessary for RCU mode of the maple tree. Link: https://lkml.kernel.org/r/20230327185532.2354250-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230227173632.3292573-1-surenb@google.com Link: https://lkml.kernel.org/r/20230227173632.3292573-2-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Signed-off-by: Suren Baghdasaryan Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: freak07 Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) (limited to 'lib/maple_tree.c') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9e2735cbc2b4..095b9cb1f4f1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -544,6 +544,7 @@ static inline bool ma_dead_node(const struct maple_node *node) return (parent == node); } + /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node @@ -625,6 +626,8 @@ static inline unsigned int mas_alloc_req(const struct ma_state *mas) * @node - the maple node * @type - the node type * + * In the event of a dead node, this array may be %NULL + * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, @@ -1096,8 +1099,11 @@ static int mas_ascend(struct ma_state *mas) a_type = mas_parent_enum(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); - pivots = ma_pivots(a_node, a_type); a_enode = mt_mk_node(a_node, a_type); + pivots = ma_pivots(a_node, a_type); + + if (unlikely(ma_dead_node(a_node))) + return 1; if (!set_min && a_slot) { set_min = true; @@ -1401,6 +1407,9 @@ static inline unsigned char ma_data_end(struct maple_node *node, { unsigned char offset; + if (!pivots) + return 0; + if (type == maple_arange_64) return ma_meta_end(node, type); @@ -1436,6 +1445,9 @@ static inline unsigned char mas_data_end(struct ma_state *mas) return ma_meta_end(node, type); pivots = ma_pivots(node, type); + if (unlikely(ma_dead_node(node))) + return 0; + offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); @@ -4505,6 +4517,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); + if (unlikely(ma_dead_node(node))) + return 1; + mas->max = pivots[offset]; if (offset) mas->min = pivots[offset - 1] + 1; @@ -4526,6 +4541,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, mas->max); + if (unlikely(ma_dead_node(node))) + return 1; + if (offset) mas->min = pivots[offset - 1] + 1; @@ -4574,6 +4592,7 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, struct maple_enode *enode; int level = 0; unsigned char offset; + unsigned char node_end; enum maple_type mt; void __rcu **slots; @@ -4597,7 +4616,11 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); - } while (unlikely(offset == ma_data_end(node, mt, pivots, mas->max))); + node_end = ma_data_end(node, mt, pivots, mas->max); + if (unlikely(ma_dead_node(node))) + return 1; + + } while (unlikely(offset == node_end)); slots = ma_slots(node, mt); pivot = mas_safe_pivot(mas, pivots, ++offset, mt); @@ -4613,6 +4636,9 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, mt = mte_node_type(mas->node); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); + if (unlikely(ma_dead_node(node))) + return 1; + offset = 0; pivot = pivots[0]; } @@ -4659,11 +4685,14 @@ static inline void *mas_next_nentry(struct ma_state *mas, return NULL; } - pivots = ma_pivots(node, type); slots = ma_slots(node, type); - mas->index = mas_safe_min(mas, pivots, mas->offset); + pivots = ma_pivots(node, type); count = ma_data_end(node, type, pivots, mas->max); - if (ma_dead_node(node)) + if (unlikely(ma_dead_node(node))) + return NULL; + + mas->index = mas_safe_min(mas, pivots, mas->offset); + if (unlikely(ma_dead_node(node))) return NULL; if (mas->index > max) @@ -4817,6 +4846,11 @@ retry: slots = ma_slots(mn, mt); pivots = ma_pivots(mn, mt); + if (unlikely(ma_dead_node(mn))) { + mas_rewalk(mas, index); + goto retry; + } + if (offset == mt_pivots[mt]) pivot = mas->max; else @@ -6617,11 +6651,11 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, while (likely(!ma_is_leaf(mt))) { MT_BUG_ON(mas->tree, mte_dead_node(mas->node)); slots = ma_slots(mn, mt); - pivots = ma_pivots(mn, mt); - max = pivots[0]; entry = mas_slot(mas, slots, 0); + pivots = ma_pivots(mn, mt); if (unlikely(ma_dead_node(mn))) return NULL; + max = pivots[0]; mas->node = entry; mn = mas_mn(mas); mt = mte_node_type(mas->node); @@ -6641,13 +6675,13 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, if (likely(entry)) return entry; - pivots = ma_pivots(mn, mt); - mas->index = pivots[0] + 1; mas->offset = 1; entry = mas_slot(mas, slots, 1); + pivots = ma_pivots(mn, mt); if (unlikely(ma_dead_node(mn))) return NULL; + mas->index = pivots[0] + 1; if (mas->index > limit) goto none; -- cgit v1.2.3 From a7b92d59c885018cb7bb88539892278e4fd64b29 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 27 Feb 2023 09:36:01 -0800 Subject: maple_tree: detect dead nodes in mas_start() When initially starting a search, the root node may already be in the process of being replaced in RCU mode. Detect and restart the walk if this is the case. This is necessary for RCU mode of the maple tree. Link: https://lkml.kernel.org/r/20230227173632.3292573-3-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Signed-off-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib/maple_tree.c') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 095b9cb1f4f1..3d53339656e1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1360,12 +1360,16 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) mas->max = ULONG_MAX; mas->depth = 0; +retry: root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->node = mte_safe_root(root); mas->offset = 0; + if (mte_dead_node(mas->node)) + goto retry; + return NULL; } -- cgit v1.2.3 From 2e5b4921f8efc9e845f4f04741797d16f36847eb Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 27 Feb 2023 09:36:02 -0800 Subject: maple_tree: fix freeing of nodes in rcu mode The walk to destroy the nodes was not always setting the node type and would result in a destroy method potentially using the values as nodes. Avoid this by setting the correct node types. This is necessary for the RCU mode of the maple tree. Link: https://lkml.kernel.org/r/20230227173632.3292573-4-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Signed-off-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 11 deletions(-) (limited to 'lib/maple_tree.c') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3d53339656e1..946acda29521 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -902,6 +902,44 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, meta->end = end; } +/* + * mas_clear_meta() - clear the metadata information of a node, if it exists + * @mas: The maple state + * @mn: The maple node + * @mt: The maple node type + * @offset: The offset of the highest sub-gap in this node. + * @end: The end of the data in this node. + */ +static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn, + enum maple_type mt) +{ + struct maple_metadata *meta; + unsigned long *pivots; + void __rcu **slots; + void *next; + + switch (mt) { + case maple_range_64: + pivots = mn->mr64.pivot; + if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { + slots = mn->mr64.slot; + next = mas_slot_locked(mas, slots, + MAPLE_RANGE64_SLOTS - 1); + if (unlikely((mte_to_node(next) && mte_node_type(next)))) + return; /* The last slot is a node, no metadata */ + } + fallthrough; + case maple_arange_64: + meta = ma_meta(mn, mt); + break; + default: + return; + } + + meta->gap = 0; + meta->end = 0; +} + /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node @@ -5441,20 +5479,22 @@ no_gap: * mas_dead_leaves() - Mark all leaves of a node as dead. * @mas: The maple state * @slots: Pointer to the slot array + * @type: The maple node type * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline -unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots) +unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, + enum maple_type mt) { struct maple_node *node; enum maple_type type; void *entry; int offset; - for (offset = 0; offset < mt_slot_count(mas->node); offset++) { + for (offset = 0; offset < mt_slots[mt]; offset++) { entry = mas_slot_locked(mas, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); @@ -5473,14 +5513,13 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots) static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset) { - struct maple_node *node, *next; + struct maple_node *next; void __rcu **slots = NULL; next = mas_mn(mas); do { - mas->node = ma_enode_ptr(next); - node = mas_mn(mas); - slots = ma_slots(node, node->type); + mas->node = mt_mk_node(next, next->type); + slots = ma_slots(next, next->type); next = mas_slot_locked(mas, slots, offset); offset = 0; } while (!ma_is_leaf(next->type)); @@ -5544,11 +5583,14 @@ static inline void __rcu **mas_destroy_descend(struct ma_state *mas, node = mas_mn(mas); slots = ma_slots(node, mte_node_type(mas->node)); next = mas_slot_locked(mas, slots, 0); - if ((mte_dead_node(next))) + if ((mte_dead_node(next))) { + mte_to_node(next)->type = mte_node_type(next); next = mas_slot_locked(mas, slots, 1); + } mte_set_node_dead(mas->node); node->type = mte_node_type(mas->node); + mas_clear_meta(mas, node, node->type); node->piv_parent = prev; node->parent_slot = offset; offset = 0; @@ -5568,13 +5610,18 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, MA_STATE(mas, &mt, 0, 0); - if (mte_is_leaf(enode)) + mas.node = enode; + if (mte_is_leaf(enode)) { + node->type = mte_node_type(enode); goto free_leaf; + } + ma_flags &= ~MT_FLAGS_LOCK_MASK; mt_init_flags(&mt, ma_flags); mas_lock(&mas); - mas.node = start = enode; + mte_to_node(enode)->ma_flags = ma_flags; + start = enode; slots = mas_destroy_descend(&mas, start, 0); node = mas_mn(&mas); do { @@ -5582,7 +5629,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, unsigned char offset; struct maple_enode *parent, *tmp; - node->slot_len = mas_dead_leaves(&mas, slots); + node->type = mte_node_type(mas.node); + node->slot_len = mas_dead_leaves(&mas, slots, node->type); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; @@ -5606,7 +5654,8 @@ next: } while (start != mas.node); node = mas_mn(&mas); - node->slot_len = mas_dead_leaves(&mas, slots); + node->type = mte_node_type(mas.node); + node->slot_len = mas_dead_leaves(&mas, slots, node->type); if (free) mt_free_bulk(node->slot_len, slots); @@ -5616,6 +5665,8 @@ start_slots_free: free_leaf: if (free) mt_free_rcu(&node->rcu); + else + mas_clear_meta(&mas, node, node->type); } /* -- cgit v1.2.3 From 8372f4d83f96f35915106093cde4565836587123 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 27 Feb 2023 09:36:03 -0800 Subject: maple_tree: remove extra smp_wmb() from mas_dead_leaves() The call to mte_set_dead_node() before the smp_wmb() already calls smp_wmb() so this is not needed. This is an optimization for the RCU mode of the maple tree. Link: https://lkml.kernel.org/r/20230227173632.3292573-5-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Signed-off-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib/maple_tree.c') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 946acda29521..96d673e4ba5b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5503,7 +5503,6 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, break; mte_set_node_dead(entry); - smp_wmb(); /* Needed for RCU */ node->type = type; rcu_assign_pointer(slots[offset], node); } -- cgit v1.2.3 From c13af03de46ba27674dd9fb31a17c0d480081139 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 27 Feb 2023 09:36:04 -0800 Subject: maple_tree: fix write memory barrier of nodes once dead for RCU mode During the development of the maple tree, the strategy of freeing multiple nodes changed and, in the process, the pivots were reused to store pointers to dead nodes. To ensure the readers see accurate pivots, the writers need to mark the nodes as dead and call smp_wmb() to ensure any readers can identify the node as dead before using the pivot values. There were two places where the old method of marking the node as dead without smp_wmb() were being used, which resulted in RCU readers seeing the wrong pivot value before seeing the node was dead. Fix this race condition by using mte_set_node_dead() which has the smp_wmb() call to ensure the race is closed. Add a WARN_ON() to the ma_free_rcu() call to ensure all nodes being freed are marked as dead to ensure there are no other call paths besides the two updated paths. This is necessary for the RCU mode of the maple tree. Link: https://lkml.kernel.org/r/20230227173632.3292573-6-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Signed-off-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 7 +++++-- tools/testing/radix-tree/maple.c | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'lib/maple_tree.c') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 96d673e4ba5b..5202d89ba56e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -185,7 +185,7 @@ static void mt_free_rcu(struct rcu_head *head) */ static void ma_free_rcu(struct maple_node *node) { - node->parent = ma_parent_ptr(node); + WARN_ON(node->parent != ma_parent_ptr(node)); call_rcu(&node->rcu, mt_free_rcu); } @@ -1778,8 +1778,10 @@ static inline void mas_replace(struct ma_state *mas, bool advanced) rcu_assign_pointer(slots[offset], mas->node); } - if (!advanced) + if (!advanced) { + mte_set_node_dead(old_enode); mas_free(mas, old_enode); + } } /* @@ -4218,6 +4220,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas) done: mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end); if (in_rcu) { + mte_set_node_dead(mas->node); mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace(mas, false); } else { diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 958ee9bdb316..4c89ff333f6f 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -108,6 +108,7 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mn->slot[1] != NULL); MT_BUG_ON(mt, mas_allocated(&mas) != 0); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mas.node = MAS_START; mas_nomem(&mas, GFP_KERNEL); @@ -160,6 +161,7 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != i); MT_BUG_ON(mt, !mn); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } @@ -192,6 +194,7 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != i - 1); MT_BUG_ON(mt, !mn); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } @@ -210,6 +213,7 @@ static noinline void check_new_node(struct maple_tree *mt) mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -233,6 +237,7 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != i - j); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); } @@ -269,6 +274,7 @@ static noinline void check_new_node(struct maple_tree *mt) mn = mas_pop_node(&mas); /* get the next node. */ MT_BUG_ON(mt, mn == NULL); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -294,6 +300,7 @@ static noinline void check_new_node(struct maple_tree *mt) mn = mas_pop_node(&mas2); /* get the next node. */ MT_BUG_ON(mt, mn == NULL); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } MT_BUG_ON(mt, mas_allocated(&mas2) != 0); @@ -334,10 +341,12 @@ static noinline void check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); for (i = 1; i <= MAPLE_ALLOC_SLOTS + 1; i++) { mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); } MT_BUG_ON(mt, mas_allocated(&mas) != 0); @@ -375,6 +384,7 @@ static noinline void check_new_node(struct maple_tree *mt) mas_node_count(&mas, i); /* Request */ mas_nomem(&mas, GFP_KERNEL); /* Fill request */ mn = mas_pop_node(&mas); /* get the next node. */ + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mas_destroy(&mas); @@ -382,10 +392,13 @@ static noinline void check_new_node(struct maple_tree *mt) mas_node_count(&mas, i); /* Request */ mas_nomem(&mas, GFP_KERNEL); /* Fill request */ mn = mas_pop_node(&mas); /* get the next node. */ + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mn = mas_pop_node(&mas); /* get the next node. */ + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mn = mas_pop_node(&mas); /* get the next node. */ + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mas_destroy(&mas); } @@ -35369,6 +35382,7 @@ static noinline void check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); @@ -35386,6 +35400,7 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); + mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); @@ -35756,6 +35771,7 @@ void farmer_tests(void) tree.ma_root = mt_mk_node(node, maple_leaf_64); mt_dump(&tree); + node->parent = ma_parent_ptr(node); ma_free_rcu(node); /* Check things that will make lockdep angry */ -- cgit v1.2.3 From 0a2b18d948838e16912b3b627b504ab062b7d02a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 27 Feb 2023 09:36:05 -0800 Subject: maple_tree: add smp_rmb() to dead node detection Add an smp_rmb() before reading the parent pointer to ensure that anything read from the node prior to the parent pointer hasn't been reordered ahead of this check. The is necessary for RCU mode. Link: https://lkml.kernel.org/r/20230227173632.3292573-7-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Signed-off-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'lib/maple_tree.c') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5202d89ba56e..72c89eb03393 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -539,9 +539,11 @@ static inline struct maple_node *mte_parent(const struct maple_enode *enode) */ static inline bool ma_dead_node(const struct maple_node *node) { - struct maple_node *parent = (void *)((unsigned long) - node->parent & ~MAPLE_NODE_MASK); + struct maple_node *parent; + /* Do not reorder reads from the node prior to the parent check */ + smp_rmb(); + parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } @@ -556,6 +558,8 @@ static inline bool mte_dead_node(const struct maple_enode *enode) struct maple_node *parent, *node; node = mte_to_node(enode); + /* Do not reorder reads from the node prior to the parent check */ + smp_rmb(); parent = mte_parent(enode); return (parent == node); } -- cgit v1.2.3 From 790e1fa86b340c2bd4a327e01c161f7a1ad885f6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 27 Feb 2023 09:36:06 -0800 Subject: maple_tree: add RCU lock checking to rcu callback functions Dereferencing RCU objects within the RCU callback without the RCU check has caused lockdep to complain. Fix the RCU dereferencing by using the RCU callback lock to ensure the operation is safe. Also stop creating a new lock to use for dereferencing during destruction of the tree or subtree. Instead, pass through a pointer to the tree that has the lock that is held for RCU dereferencing checking. It also does not make sense to use the maple state in the freeing scenario as the tree walk is a special case where the tree no longer has the normal encodings and parent pointers. Link: https://lkml.kernel.org/r/20230227173632.3292573-8-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Reported-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 188 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 96 insertions(+), 92 deletions(-) (limited to 'lib/maple_tree.c') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 72c89eb03393..b1db0bd71aed 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -824,6 +824,11 @@ static inline void *mt_slot(const struct maple_tree *mt, return rcu_dereference_check(slots[offset], mt_locked(mt)); } +static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, + unsigned char offset) +{ + return rcu_dereference_protected(slots[offset], mt_locked(mt)); +} /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state @@ -835,7 +840,7 @@ static inline void *mt_slot(const struct maple_tree *mt, static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { - return rcu_dereference_protected(slots[offset], mt_locked(mas->tree)); + return mt_slot_locked(mas->tree, slots, offset); } /* @@ -907,34 +912,35 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, } /* - * mas_clear_meta() - clear the metadata information of a node, if it exists - * @mas: The maple state + * mt_clear_meta() - clear the metadata information of a node, if it exists + * @mt: The maple tree * @mn: The maple node - * @mt: The maple node type + * @type: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ -static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn, - enum maple_type mt) +static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, + enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next; - switch (mt) { + switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; - next = mas_slot_locked(mas, slots, - MAPLE_RANGE64_SLOTS - 1); - if (unlikely((mte_to_node(next) && mte_node_type(next)))) - return; /* The last slot is a node, no metadata */ + next = mt_slot_locked(mt, slots, + MAPLE_RANGE64_SLOTS - 1); + if (unlikely((mte_to_node(next) && + mte_node_type(next)))) + return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: - meta = ma_meta(mn, mt); + meta = ma_meta(mn, type); break; default: return; @@ -5483,7 +5489,7 @@ no_gap: } /* - * mas_dead_leaves() - Mark all leaves of a node as dead. + * mte_dead_leaves() - Mark all leaves of a node as dead. * @mas: The maple state * @slots: Pointer to the slot array * @type: The maple node type @@ -5493,16 +5499,16 @@ no_gap: * Return: The number of leaves marked as dead. */ static inline -unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, - enum maple_type mt) +unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, + void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; - for (offset = 0; offset < mt_slots[mt]; offset++) { - entry = mas_slot_locked(mas, slots, offset); + for (offset = 0; offset < mt_slot_count(enode); offset++) { + entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ @@ -5517,162 +5523,160 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, return offset; } -static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset) +/** + * mte_dead_walk() - Walk down a dead tree to just before the leaves + * @enode: The maple encoded node + * @offset: The starting offset + * + * Note: This can only be used from the RCU callback context. + */ +static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { - struct maple_node *next; + struct maple_node *node, *next; void __rcu **slots = NULL; - next = mas_mn(mas); + next = mte_to_node(*enode); do { - mas->node = mt_mk_node(next, next->type); - slots = ma_slots(next, next->type); - next = mas_slot_locked(mas, slots, offset); + *enode = ma_enode_ptr(next); + node = mte_to_node(*enode); + slots = ma_slots(node, node->type); + next = rcu_dereference_protected(slots[offset], + lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } +/** + * mt_free_walk() - Walk & free a tree in the RCU callback context + * @head: The RCU head that's within the node. + * + * Note: This can only be used from the RCU callback context. + */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; - struct maple_tree mt; + struct maple_enode *enode; unsigned char offset; enum maple_type type; - MA_STATE(mas, &mt, 0, 0); node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; - mt_init_flags(&mt, node->ma_flags); - mas_lock(&mas); start = node; - mas.node = mt_mk_node(node, node->type); - slots = mas_dead_walk(&mas, 0); - node = mas_mn(&mas); + enode = mt_mk_node(node, node->type); + slots = mte_dead_walk(&enode, 0); + node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; - mas.node = node->piv_parent; - if (mas_mn(&mas) == node) - goto start_slots_free; - - type = mte_node_type(mas.node); - slots = ma_slots(mte_to_node(mas.node), type); - if ((offset < mt_slots[type]) && (slots[offset])) - slots = mas_dead_walk(&mas, offset); - - node = mas_mn(&mas); + enode = node->piv_parent; + if (mte_to_node(enode) == node) + goto free_leaf; + + type = mte_node_type(enode); + slots = ma_slots(mte_to_node(enode), type); + if ((offset < mt_slots[type]) && + rcu_dereference_protected(slots[offset], + lock_is_held(&rcu_callback_map))) + slots = mte_dead_walk(&enode, offset); + node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); -start_slots_free: - mas_unlock(&mas); free_leaf: mt_free_rcu(&node->rcu); } -static inline void __rcu **mas_destroy_descend(struct ma_state *mas, - struct maple_enode *prev, unsigned char offset) +static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, + struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; - struct maple_enode *next = mas->node; + struct maple_enode *next = *enode; void __rcu **slots = NULL; + enum maple_type type; + unsigned char next_offset = 0; do { - mas->node = next; - node = mas_mn(mas); - slots = ma_slots(node, mte_node_type(mas->node)); - next = mas_slot_locked(mas, slots, 0); - if ((mte_dead_node(next))) { - mte_to_node(next)->type = mte_node_type(next); - next = mas_slot_locked(mas, slots, 1); - } + *enode = next; + node = mte_to_node(*enode); + type = mte_node_type(*enode); + slots = ma_slots(node, type); + next = mt_slot_locked(mt, slots, next_offset); + if ((mte_dead_node(next))) + next = mt_slot_locked(mt, slots, ++next_offset); - mte_set_node_dead(mas->node); - node->type = mte_node_type(mas->node); - mas_clear_meta(mas, node, node->type); + mte_set_node_dead(*enode); + node->type = type; node->piv_parent = prev; node->parent_slot = offset; - offset = 0; - prev = mas->node; + offset = next_offset; + next_offset = 0; + prev = *enode; } while (!mte_is_leaf(next)); return slots; } -static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, +static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; - struct maple_tree mt; - - MA_STATE(mas, &mt, 0, 0); - mas.node = enode; if (mte_is_leaf(enode)) { node->type = mte_node_type(enode); goto free_leaf; } - ma_flags &= ~MT_FLAGS_LOCK_MASK; - mt_init_flags(&mt, ma_flags); - mas_lock(&mas); - - mte_to_node(enode)->ma_flags = ma_flags; start = enode; - slots = mas_destroy_descend(&mas, start, 0); - node = mas_mn(&mas); + slots = mte_destroy_descend(&enode, mt, start, 0); + node = mte_to_node(enode); // Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; - node->type = mte_node_type(mas.node); - node->slot_len = mas_dead_leaves(&mas, slots, node->type); + node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; - mas.node = node->piv_parent; - if (mas_mn(&mas) == node) - goto start_slots_free; + enode = node->piv_parent; + if (mte_to_node(enode) == node) + goto free_leaf; - type = mte_node_type(mas.node); - slots = ma_slots(mte_to_node(mas.node), type); + type = mte_node_type(enode); + slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; - tmp = mas_slot_locked(&mas, slots, offset); + tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { - parent = mas.node; - mas.node = tmp; - slots = mas_destroy_descend(&mas, parent, offset); + parent = enode; + enode = tmp; + slots = mte_destroy_descend(&enode, mt, parent, offset); } next: - node = mas_mn(&mas); - } while (start != mas.node); + node = mte_to_node(enode); + } while (start != enode); - node = mas_mn(&mas); - node->type = mte_node_type(mas.node); - node->slot_len = mas_dead_leaves(&mas, slots, node->type); + node = mte_to_node(enode); + node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); -start_slots_free: - mas_unlock(&mas); - free_leaf: if (free) mt_free_rcu(&node->rcu); else - mas_clear_meta(&mas, node, node->type); + mt_clear_meta(mt, node, node->type); } /* @@ -5688,10 +5692,10 @@ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { - mt_destroy_walk(enode, mt->ma_flags, false); + mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { - mt_destroy_walk(enode, mt->ma_flags, true); + mt_destroy_walk(enode, mt, true); } } -- cgit v1.2.3 From ec07967d7523adb3670f9dfee0232e3bc868f3de Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 14 Mar 2023 20:42:01 +0800 Subject: maple_tree: fix get wrong data_end in mtree_lookup_walk() if (likely(offset > end)) max = pivots[offset]; The above code should be changed to if (likely(offset < end)), which is correct. This affects the correctness of ma_data_end(). Now it seems that the final result will not be wrong, but it is best to change it. This patch does not change the code as above, because it simplifies the code by the way. Link: https://lkml.kernel.org/r/20230314124203.91572-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20230314124203.91572-2-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'lib/maple_tree.c') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b1db0bd71aed..b8a230f5d94e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3941,18 +3941,13 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) end = ma_data_end(node, type, pivots, max); if (unlikely(ma_dead_node(node))) goto dead_node; - - if (pivots[offset] >= mas->index) - goto next; - do { - offset++; - } while ((offset < end) && (pivots[offset] < mas->index)); - - if (likely(offset > end)) - max = pivots[offset]; + if (pivots[offset] >= mas->index) { + max = pivots[offset]; + break; + } + } while (++offset < end); -next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) -- cgit v1.2.3 From c45ea315a602d45569b08b93e9ab30f6a63a38aa Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 14 Mar 2023 20:42:03 +0800 Subject: maple_tree: fix a potential concurrency bug in RCU mode There is a concurrency bug that may cause the wrong value to be loaded when a CPU is modifying the maple tree. CPU1: mtree_insert_range() mas_insert() mas_store_root() ... mas_root_expand() ... rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); ma_set_meta(node, maple_leaf_64, 0, slot); <---IP CPU2: mtree_load() mtree_lookup_walk() ma_data_end(); When CPU1 is about to execute the instruction pointed to by IP, the ma_data_end() executed by CPU2 may return the wrong end position, which will cause the value loaded by mtree_load() to be wrong. An example of triggering the bug: Add mdelay(100) between rcu_assign_pointer() and ma_set_meta() in mas_root_expand(). static DEFINE_MTREE(tree); int work(void *p) { unsigned long val; for (int i = 0 ; i< 30; ++i) { val = (unsigned long)mtree_load(&tree, 8); mdelay(5); pr_info("%lu",val); } return 0; } mt_init_flags(&tree, MT_FLAGS_USE_RCU); mtree_insert(&tree, 0, (void*)12345, GFP_KERNEL); run_thread(work) mtree_insert(&tree, 1, (void*)56789, GFP_KERNEL); In RCU mode, mtree_load() should always return the value before or after the data structure is modified, and in this example mtree_load(&tree, 8) may return 56789 which is not expected, it should always return NULL. Fix it by put ma_set_meta() before rcu_assign_pointer(). Link: https://lkml.kernel.org/r/20230314124203.91572-4-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib/maple_tree.c') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b8a230f5d94e..db60edb55f2f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3725,10 +3725,9 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) slot++; mas->depth = 1; mas_set_height(mas); - + ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); - ma_set_meta(node, maple_leaf_64, 0, slot); return slot; } -- cgit v1.2.3 From 1f5f12ece722aacea1769fb644f27790ede339dc Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Apr 2023 12:10:04 +0800 Subject: maple_tree: fix a potential memory leak, OOB access, or other unpredictable bug In mas_alloc_nodes(), "node->node_count = 0" means to initialize the node_count field of the new node, but the node may not be a new node. It may be a node that existed before and node_count has a value, setting it to 0 will cause a memory leak. At this time, mas->alloc->total will be greater than the actual number of nodes in the linked list, which may cause many other errors. For example, out-of-bounds access in mas_pop_node(), and mas_pop_node() may return addresses that should not be used. Fix it by initializing node_count only for new nodes. Also, by the way, an if-else statement was removed to simplify the code. Link: https://lkml.kernel.org/r/20230411041005.26205-1-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'lib/maple_tree.c') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index db60edb55f2f..7ff2a821a2a1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1303,26 +1303,21 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node = mas->alloc; node->request_count = 0; while (requested) { - max_req = MAPLE_ALLOC_SLOTS; - if (node->node_count) { - unsigned int offset = node->node_count; - - slots = (void **)&node->slot[offset]; - max_req -= offset; - } else { - slots = (void **)&node->slot; - } - + max_req = MAPLE_ALLOC_SLOTS - node->node_count; + slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; + if (node->node_count == 0) { + node->slot[0]->node_count = 0; + node->slot[0]->request_count = 0; + } + node->node_count += count; allocated += count; node = node->slot[0]; - node->node_count = 0; - node->request_count = 0; requested -= count; } mas->alloc->total = allocated; -- cgit v1.2.3