From f7942430e40f14c6d2ca48a1875add509938c07d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:41 -0800 Subject: lib: radix-tree: native accounting of exceptional entries The way the page cache is sneaking shadow entries of evicted pages into the radix tree past the node entry accounting and tracking them manually in the upper bits of node->count is fraught with problems. These shadow entries are marked in the tree as exceptional entries, which are a native concept to the radix tree. Maintain an explicit counter of exceptional entries in the radix tree node. Subsequent patches will switch shadow entry tracking over to that counter. DAX and shmem are the other users of exceptional entries. Since slot replacements that change the entry type from regular to exceptional must now be accounted, introduce a __radix_tree_replace() function that does replacement and accounting, and switch DAX and shmem over. The increase in radix tree node size is temporary. A followup patch switches the shadow tracking to this new scheme and we'll no longer need the upper bits in node->count and shrink that back to one byte. Link: http://lkml.kernel.org/r/20161117192945.GA23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8e6d552c40dd..7885796d35ae 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->parent); + node->shift, node->count, node->exceptional, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << node->shift); @@ -522,8 +522,13 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_internal_node(slot)) + if (radix_tree_is_internal_node(slot)) { entry_to_node(slot)->parent = node; + } else { + /* Moving an exceptional root->rnode to a node */ + if (radix_tree_exceptional_entry(slot)) + node->exceptional = 1; + } node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -649,6 +654,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, if (node) { unsigned offset = get_slot_offset(node, slot); node->count++; + if (radix_tree_exceptional_entry(item)) + node->exceptional++; BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); @@ -746,6 +753,37 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item) +{ + void *old = rcu_dereference_raw(*slot); + int exceptional; + + WARN_ON_ONCE(radix_tree_is_internal_node(item)); + WARN_ON_ONCE(!!item - !!old); + + exceptional = !!radix_tree_exceptional_entry(item) - + !!radix_tree_exceptional_entry(old); + + WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode); + + if (node) + node->exceptional += exceptional; + + rcu_assign_pointer(*slot, item); +} + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root @@ -1561,6 +1599,8 @@ void *radix_tree_delete_item(struct radix_tree_root *root, delete_sibling_entries(node, node_to_entry(slot), offset); node->slots[offset] = NULL; node->count--; + if (radix_tree_exceptional_entry(entry)) + node->exceptional--; __radix_tree_delete_node(root, node); -- cgit v1.2.3 From 6d75f366b9242f9b17ed7d0b0604d7460f818f21 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:43 -0800 Subject: lib: radix-tree: check accounting of existing slot replacement users The bug in khugepaged fixed earlier in this series shows that radix tree slot replacement is fragile; and it will become more so when not only NULL<->!NULL transitions need to be caught but transitions from and to exceptional entries as well. We need checks. Re-implement radix_tree_replace_slot() on top of the sanity-checked __radix_tree_replace(). This requires existing callers to also pass the radix tree root, but it'll warn us when somebody replaces slots with contents that need proper accounting (transitions between NULL entries, real entries, exceptional entries) and where a replacement through the slot pointer would corrupt the radix tree node counts. Link: http://lkml.kernel.org/r/20161117193021.GB23430@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/gmap.c | 2 +- drivers/sh/intc/virq.c | 2 +- fs/dax.c | 4 +-- include/linux/radix-tree.h | 16 ++------- lib/radix-tree.c | 63 +++++++++++++++++++++++++++-------- mm/filemap.c | 4 +-- mm/khugepaged.c | 5 +-- mm/migrate.c | 4 +-- mm/truncate.c | 2 +- tools/testing/radix-tree/multiorder.c | 2 +- 10 files changed, 64 insertions(+), 40 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 3ba622702ce4..ec1f0dedb948 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); - radix_tree_replace_slot(slot, rmap); + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c index e7899624aa0b..35bbe288ddb4 100644 --- a/drivers/sh/intc/virq.c +++ b/drivers/sh/intc/virq.c @@ -254,7 +254,7 @@ restart: radix_tree_tag_clear(&d->tree, entry->enum_id, INTC_TAG_VIRQ_NEEDS_ALLOC); - radix_tree_replace_slot((void **)entries[i], + radix_tree_replace_slot(&d->tree, (void **)entries[i], &intc_irq_xlate[irq]); } diff --git a/fs/dax.c b/fs/dax.c index db78bae0dc0f..85930c2a2749 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -342,7 +342,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -356,7 +356,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 7ced8a70cc8b..2d1b9b8be983 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -249,20 +249,6 @@ static inline int radix_tree_exception(void *arg) return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } -/** - * radix_tree_replace_slot - replace item in a slot - * @pslot: pointer to slot, returned by radix_tree_lookup_slot - * @item: new item to store in the slot. - * - * For use with radix_tree_lookup_slot(). Caller must hold tree write locked - * across slot lookup and replacement. - */ -static inline void radix_tree_replace_slot(void **pslot, void *item) -{ - BUG_ON(radix_tree_is_internal_node(item)); - rcu_assign_pointer(*pslot, item); -} - int __radix_tree_create(struct radix_tree_root *root, unsigned long index, unsigned order, struct radix_tree_node **nodep, void ***slotp); @@ -280,6 +266,8 @@ void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void **slot, void *item); +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item); bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 7885796d35ae..f91d5b0af654 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -753,19 +753,10 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); -/** - * __radix_tree_replace - replace item in a slot - * @root: radix tree root - * @node: pointer to tree node - * @slot: pointer to slot in @node - * @item: new item to store in the slot. - * - * For use with __radix_tree_lookup(). Caller must hold tree write locked - * across slot lookup and replacement. - */ -void __radix_tree_replace(struct radix_tree_root *root, - struct radix_tree_node *node, - void **slot, void *item) +static void replace_slot(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + bool warn_typeswitch) { void *old = rcu_dereference_raw(*slot); int exceptional; @@ -776,7 +767,7 @@ void __radix_tree_replace(struct radix_tree_root *root, exceptional = !!radix_tree_exceptional_entry(item) - !!radix_tree_exceptional_entry(old); - WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode); + WARN_ON_ONCE(warn_typeswitch && exceptional); if (node) node->exceptional += exceptional; @@ -784,6 +775,50 @@ void __radix_tree_replace(struct radix_tree_root *root, rcu_assign_pointer(*slot, item); } +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item) +{ + /* + * This function supports replacing exceptional entries, but + * that needs accounting against the node unless the slot is + * root->rnode. + */ + replace_slot(root, node, slot, item, + !node && slot != (void **)&root->rnode); +} + +/** + * radix_tree_replace_slot - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), + * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked + * across slot lookup and replacement. + * + * NOTE: This cannot be used to switch between non-entries (empty slots), + * regular entries, and exceptional entries, as that requires accounting + * inside the radix tree node. When switching from one type of entry to + * another, use __radix_tree_lookup() and __radix_tree_replace(). + */ +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item) +{ + replace_slot(root, NULL, slot, item, true); +} + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root diff --git a/mm/filemap.c b/mm/filemap.c index caa779f8797f..1ba726aef708 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -147,7 +147,7 @@ static int page_cache_tree_insert(struct address_space *mapping, false); } } - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, slot, page); mapping->nrpages++; if (node) { workingset_node_pages_inc(node); @@ -196,7 +196,7 @@ static void page_cache_tree_delete(struct address_space *mapping, shadow = NULL; } - radix_tree_replace_slot(slot, shadow); + radix_tree_replace_slot(&mapping->page_tree, slot, shadow); if (!node) break; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5d7c006373d3..7a50c726c5ae 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1426,7 +1426,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(slot, + radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); slot = radix_tree_iter_next(&iter); @@ -1538,7 +1538,8 @@ tree_unlocked: /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, + slot, page); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); diff --git a/mm/migrate.c b/mm/migrate.c index 66ce6b490b13..0ed24b1fa77b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); diff --git a/mm/truncate.c b/mm/truncate.c index 8d8c62d89e6d..3c631c357873 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -49,7 +49,7 @@ static void clear_exceptional_entry(struct address_space *mapping, goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(slot, NULL); + radix_tree_replace_slot(&mapping->page_tree, slot, NULL); mapping->nrexceptional--; if (!node) goto unlock; diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 05d7bc488971..d1be94667a30 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -146,7 +146,7 @@ static void multiorder_check(unsigned long index, int order) slot = radix_tree_lookup_slot(&tree, index); free(*slot); - radix_tree_replace_slot(slot, item2); + radix_tree_replace_slot(&tree, slot, item2); for (i = min; i < max; i++) { struct item *item = item_lookup(&tree, i); assert(item != 0); -- cgit v1.2.3 From f4b109c6dad54257eca837f9dd16a23f2eeab832 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:46 -0800 Subject: lib: radix-tree: add entry deletion support to __radix_tree_replace() Page cache shadow entry handling will be a lot simpler when it can use a single generic replacement function for pages, shadow entries, and emptying slots. Make __radix_tree_replace() properly account insertions and deletions in node->count and garbage collect nodes as they become empty. Then re-implement radix_tree_delete() on top of it. Link: http://lkml.kernel.org/r/20161117193058.GC23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 227 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 116 insertions(+), 111 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index f91d5b0af654..5d8930f3b3d8 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -538,6 +538,107 @@ out: return maxshift + RADIX_TREE_MAP_SHIFT; } +/** + * radix_tree_shrink - shrink radix tree to minimum height + * @root radix tree root + */ +static inline bool radix_tree_shrink(struct radix_tree_root *root) +{ + bool shrunk = false; + + for (;;) { + struct radix_tree_node *node = root->rnode; + struct radix_tree_node *child; + + if (!radix_tree_is_internal_node(node)) + break; + node = entry_to_node(node); + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. + */ + if (node->count != 1) + break; + child = node->slots[0]; + if (!child) + break; + if (!radix_tree_is_internal_node(child) && node->shift) + break; + + if (radix_tree_is_internal_node(child)) + entry_to_node(child)->parent = NULL; + + /* + * We don't need rcu_assign_pointer(), since we are simply + * moving the node from one part of the tree to another: if it + * was safe to dereference the old pointer to it + * (node->slots[0]), it will be safe to dereference the new + * one (root->rnode) as far as dependent read barriers go. + */ + root->rnode = child; + + /* + * We have a dilemma here. The node's slot[0] must not be + * NULLed in case there are concurrent lookups expecting to + * find the item. However if this was a bottom-level node, + * then it may be subject to the slot pointer being visible + * to callers dereferencing it. If item corresponding to + * slot[0] is subsequently deleted, these callers would expect + * their slot to become empty sooner or later. + * + * For example, lockless pagecache will look up a slot, deref + * the page pointer, and if the page has 0 refcount it means it + * was concurrently deleted from pagecache so try the deref + * again. Fortunately there is already a requirement for logic + * to retry the entire slot lookup -- the indirect pointer + * problem (replacing direct root node with an indirect pointer + * also results in a stale slot). So tag the slot as indirect + * to force callers to retry. + */ + if (!radix_tree_is_internal_node(child)) + node->slots[0] = RADIX_TREE_RETRY; + + radix_tree_node_free(node); + shrunk = true; + } + + return shrunk; +} + +static bool delete_node(struct radix_tree_root *root, + struct radix_tree_node *node) +{ + bool deleted = false; + + do { + struct radix_tree_node *parent; + + if (node->count) { + if (node == entry_to_node(root->rnode)) + deleted |= radix_tree_shrink(root); + return deleted; + } + + parent = node->parent; + if (parent) { + parent->slots[node->offset] = NULL; + parent->count--; + } else { + root_tag_clear_all(root); + root->rnode = NULL; + } + + radix_tree_node_free(node); + deleted = true; + + node = parent; + } while (node); + + return deleted; +} + /** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root @@ -759,18 +860,20 @@ static void replace_slot(struct radix_tree_root *root, bool warn_typeswitch) { void *old = rcu_dereference_raw(*slot); - int exceptional; + int count, exceptional; WARN_ON_ONCE(radix_tree_is_internal_node(item)); - WARN_ON_ONCE(!!item - !!old); + count = !!item - !!old; exceptional = !!radix_tree_exceptional_entry(item) - !!radix_tree_exceptional_entry(old); - WARN_ON_ONCE(warn_typeswitch && exceptional); + WARN_ON_ONCE(warn_typeswitch && (count || exceptional)); - if (node) + if (node) { + node->count += count; node->exceptional += exceptional; + } rcu_assign_pointer(*slot, item); } @@ -790,12 +893,14 @@ void __radix_tree_replace(struct radix_tree_root *root, void **slot, void *item) { /* - * This function supports replacing exceptional entries, but - * that needs accounting against the node unless the slot is - * root->rnode. + * This function supports replacing exceptional entries and + * deleting entries, but that needs accounting against the + * node unless the slot is root->rnode. */ replace_slot(root, node, slot, item, !node && slot != (void **)&root->rnode); + + delete_node(root, node); } /** @@ -810,8 +915,8 @@ void __radix_tree_replace(struct radix_tree_root *root, * * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and exceptional entries, as that requires accounting - * inside the radix tree node. When switching from one type of entry to - * another, use __radix_tree_lookup() and __radix_tree_replace(). + * inside the radix tree node. When switching from one type of entry or + * deleting, use __radix_tree_lookup() and __radix_tree_replace(). */ void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item) @@ -1466,75 +1571,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) } #endif /* CONFIG_SHMEM && CONFIG_SWAP */ -/** - * radix_tree_shrink - shrink radix tree to minimum height - * @root radix tree root - */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) -{ - bool shrunk = false; - - for (;;) { - struct radix_tree_node *node = root->rnode; - struct radix_tree_node *child; - - if (!radix_tree_is_internal_node(node)) - break; - node = entry_to_node(node); - - /* - * The candidate node has more than one child, or its child - * is not at the leftmost slot, or the child is a multiorder - * entry, we cannot shrink. - */ - if (node->count != 1) - break; - child = node->slots[0]; - if (!child) - break; - if (!radix_tree_is_internal_node(child) && node->shift) - break; - - if (radix_tree_is_internal_node(child)) - entry_to_node(child)->parent = NULL; - - /* - * We don't need rcu_assign_pointer(), since we are simply - * moving the node from one part of the tree to another: if it - * was safe to dereference the old pointer to it - * (node->slots[0]), it will be safe to dereference the new - * one (root->rnode) as far as dependent read barriers go. - */ - root->rnode = child; - - /* - * We have a dilemma here. The node's slot[0] must not be - * NULLed in case there are concurrent lookups expecting to - * find the item. However if this was a bottom-level node, - * then it may be subject to the slot pointer being visible - * to callers dereferencing it. If item corresponding to - * slot[0] is subsequently deleted, these callers would expect - * their slot to become empty sooner or later. - * - * For example, lockless pagecache will look up a slot, deref - * the page pointer, and if the page has 0 refcount it means it - * was concurrently deleted from pagecache so try the deref - * again. Fortunately there is already a requirement for logic - * to retry the entire slot lookup -- the indirect pointer - * problem (replacing direct root node with an indirect pointer - * also results in a stale slot). So tag the slot as indirect - * to force callers to retry. - */ - if (!radix_tree_is_internal_node(child)) - node->slots[0] = RADIX_TREE_RETRY; - - radix_tree_node_free(node); - shrunk = true; - } - - return shrunk; -} - /** * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root @@ -1549,33 +1585,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - bool deleted = false; - - do { - struct radix_tree_node *parent; - - if (node->count) { - if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); - return deleted; - } - - parent = node->parent; - if (parent) { - parent->slots[node->offset] = NULL; - parent->count--; - } else { - root_tag_clear_all(root); - root->rnode = NULL; - } - - radix_tree_node_free(node); - deleted = true; - - node = parent; - } while (node); - - return deleted; + return delete_node(root, node); } static inline void delete_sibling_entries(struct radix_tree_node *node, @@ -1632,12 +1642,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); - node->slots[offset] = NULL; - node->count--; - if (radix_tree_exceptional_entry(entry)) - node->exceptional--; - - __radix_tree_delete_node(root, node); + __radix_tree_replace(root, node, slot, NULL); return entry; } -- cgit v1.2.3 From 4d693d08607ab319095ec8942909df4b4aebdf66 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:49 -0800 Subject: lib: radix-tree: update callback for changing leaf nodes Support handing __radix_tree_replace() a callback that gets invoked for all leaf nodes that change or get freed as a result of the slot replacement, to assist users tracking nodes with node->private_list. This prepares for putting page cache shadow entries into the radix tree root again and drastically simplifying the shadow tracking. Link: http://lkml.kernel.org/r/20161117193134.GD23430@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 3 ++- include/linux/radix-tree.h | 4 +++- lib/radix-tree.c | 42 +++++++++++++++++++++++++++++------------- mm/shmem.c | 3 ++- 4 files changed, 36 insertions(+), 16 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/fs/dax.c b/fs/dax.c index 85930c2a2749..6916ed37d463 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -649,7 +649,8 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); - __radix_tree_replace(page_tree, node, slot, new_entry); + __radix_tree_replace(page_tree, node, slot, + new_entry, NULL, NULL); } if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 2d1b9b8be983..15c972ea9510 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -263,9 +263,11 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *); void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, - void **slot, void *item); + void **slot, void *item, + radix_tree_update_node_t update_node, void *private); void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); bool __radix_tree_delete_node(struct radix_tree_root *root, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 5d8930f3b3d8..df4ff18dd63c 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) tag_clear(node, i, 0); node->slots[0] = NULL; - node->count = 0; kmem_cache_free(radix_tree_node_cachep, node); } @@ -542,7 +541,9 @@ out: * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) +static inline bool radix_tree_shrink(struct radix_tree_root *root, + radix_tree_update_node_t update_node, + void *private) { bool shrunk = false; @@ -597,8 +598,12 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ - if (!radix_tree_is_internal_node(child)) + node->count = 0; + if (!radix_tree_is_internal_node(child)) { node->slots[0] = RADIX_TREE_RETRY; + if (update_node) + update_node(node, private); + } radix_tree_node_free(node); shrunk = true; @@ -608,7 +613,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) } static bool delete_node(struct radix_tree_root *root, - struct radix_tree_node *node) + struct radix_tree_node *node, + radix_tree_update_node_t update_node, void *private) { bool deleted = false; @@ -617,7 +623,8 @@ static bool delete_node(struct radix_tree_root *root, if (node->count) { if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); + deleted |= radix_tree_shrink(root, update_node, + private); return deleted; } @@ -880,17 +887,20 @@ static void replace_slot(struct radix_tree_root *root, /** * __radix_tree_replace - replace item in a slot - * @root: radix tree root - * @node: pointer to tree node - * @slot: pointer to slot in @node - * @item: new item to store in the slot. + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * @update_node: callback for changing leaf nodes + * @private: private data to pass to @update_node * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, - void **slot, void *item) + void **slot, void *item, + radix_tree_update_node_t update_node, void *private) { /* * This function supports replacing exceptional entries and @@ -900,7 +910,13 @@ void __radix_tree_replace(struct radix_tree_root *root, replace_slot(root, node, slot, item, !node && slot != (void **)&root->rnode); - delete_node(root, node); + if (!node) + return; + + if (update_node) + update_node(node, private); + + delete_node(root, node, update_node, private); } /** @@ -1585,7 +1601,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - return delete_node(root, node); + return delete_node(root, node, NULL, NULL); } static inline void delete_sibling_entries(struct radix_tree_node *node, @@ -1642,7 +1658,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); - __radix_tree_replace(root, node, slot, NULL); + __radix_tree_replace(root, node, slot, NULL, NULL, NULL); return entry; } diff --git a/mm/shmem.c b/mm/shmem.c index 3149ddee8f55..abd7403aba41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -311,7 +311,8 @@ static int shmem_radix_tree_replace(struct address_space *mapping, return -ENOENT; if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->page_tree, node, pslot, replacement); + __radix_tree_replace(&mapping->page_tree, node, pslot, + replacement, NULL, NULL); return 0; } -- cgit v1.2.3 From 14b468791fa955d442f962fdf5207dfd39a131c8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:52 -0800 Subject: mm: workingset: move shadow entry tracking to radix tree exceptional tracking Currently, we track the shadow entries in the page cache in the upper bits of the radix_tree_node->count, behind the back of the radix tree implementation. Because the radix tree code has no awareness of them, we rely on random subtleties throughout the implementation (such as the node->count != 1 check in the shrinking code, which is meant to exclude multi-entry nodes but also happens to skip nodes with only one shadow entry, as that's accounted in the upper bits). This is error prone and has, in fact, caused the bug fixed in d3798ae8c6f3 ("mm: filemap: don't plant shadow entries without radix tree node"). To remove these subtleties, this patch moves shadow entry tracking from the upper bits of node->count to the existing counter for exceptional entries. node->count goes back to being a simple counter of valid entries in the tree node and can be shrunk to a single byte. This vastly simplifies the page cache code. All accounting happens natively inside the radix tree implementation, and maintaining the LRU linkage of shadow nodes is consolidated into a single function in the workingset code that is called for leaf nodes affected by a change in the page cache tree. This also removes the last user of the __radix_delete_node() return value. Eliminate it. Link: http://lkml.kernel.org/r/20161117193211.GE23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 8 ++----- include/linux/swap.h | 34 +--------------------------- lib/radix-tree.c | 25 +++++---------------- mm/filemap.c | 54 +++++--------------------------------------- mm/truncate.c | 21 +++-------------- mm/workingset.c | 56 +++++++++++++++++++++++++++++++++++----------- 6 files changed, 60 insertions(+), 138 deletions(-) (limited to 'lib/radix-tree.c') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 15c972ea9510..744486057e9e 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -80,14 +80,10 @@ static inline bool radix_tree_is_internal_node(void *ptr) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) -/* Internally used bits of node->count */ -#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) -#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) - struct radix_tree_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ - unsigned int count; /* Total entry count */ + unsigned char count; /* Total entry count */ unsigned char exceptional; /* Exceptional entry count */ union { struct { @@ -270,7 +266,7 @@ void __radix_tree_replace(struct radix_tree_root *root, radix_tree_update_node_t update_node, void *private); void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); -bool __radix_tree_delete_node(struct radix_tree_root *root, +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); diff --git a/include/linux/swap.h b/include/linux/swap.h index a56523cefb9b..09b212d37f1d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -246,39 +246,7 @@ struct swap_info_struct { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -extern struct list_lru workingset_shadow_nodes; - -static inline unsigned int workingset_node_pages(struct radix_tree_node *node) -{ - return node->count & RADIX_TREE_COUNT_MASK; -} - -static inline void workingset_node_pages_inc(struct radix_tree_node *node) -{ - node->count++; -} - -static inline void workingset_node_pages_dec(struct radix_tree_node *node) -{ - VM_WARN_ON_ONCE(!workingset_node_pages(node)); - node->count--; -} - -static inline unsigned int workingset_node_shadows(struct radix_tree_node *node) -{ - return node->count >> RADIX_TREE_COUNT_SHIFT; -} - -static inline void workingset_node_shadows_inc(struct radix_tree_node *node) -{ - node->count += 1U << RADIX_TREE_COUNT_SHIFT; -} - -static inline void workingset_node_shadows_dec(struct radix_tree_node *node) -{ - VM_WARN_ON_ONCE(!workingset_node_shadows(node)); - node->count -= 1U << RADIX_TREE_COUNT_SHIFT; -} +void workingset_update_node(struct radix_tree_node *node, void *private); /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index df4ff18dd63c..9dbfaac05e6c 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -541,12 +541,10 @@ out: * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline bool radix_tree_shrink(struct radix_tree_root *root, +static inline void radix_tree_shrink(struct radix_tree_root *root, radix_tree_update_node_t update_node, void *private) { - bool shrunk = false; - for (;;) { struct radix_tree_node *node = root->rnode; struct radix_tree_node *child; @@ -606,26 +604,20 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, } radix_tree_node_free(node); - shrunk = true; } - - return shrunk; } -static bool delete_node(struct radix_tree_root *root, +static void delete_node(struct radix_tree_root *root, struct radix_tree_node *node, radix_tree_update_node_t update_node, void *private) { - bool deleted = false; - do { struct radix_tree_node *parent; if (node->count) { if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root, update_node, - private); - return deleted; + radix_tree_shrink(root, update_node, private); + return; } parent = node->parent; @@ -638,12 +630,9 @@ static bool delete_node(struct radix_tree_root *root, } radix_tree_node_free(node); - deleted = true; node = parent; } while (node); - - return deleted; } /** @@ -1595,13 +1584,11 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the * node and shrinking the tree. - * - * Returns %true if @node was freed, %false otherwise. */ -bool __radix_tree_delete_node(struct radix_tree_root *root, +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - return delete_node(root, node, NULL, NULL); + delete_node(root, node, NULL, NULL); } static inline void delete_sibling_entries(struct radix_tree_node *node, diff --git a/mm/filemap.c b/mm/filemap.c index 1ba726aef708..dc3e5fce0b7b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -132,37 +132,19 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!dax_mapping(mapping)) { if (shadowp) *shadowp = p; - if (node) - workingset_node_shadows_dec(node); } else { /* DAX can replace empty locked entry with a hole */ WARN_ON_ONCE(p != (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_ENTRY_LOCK)); - /* DAX accounts exceptional entries as normal pages */ - if (node) - workingset_node_pages_dec(node); /* Wakeup waiters for exceptional entry lock */ dax_wake_mapping_entry_waiter(mapping, page->index, false); } } - radix_tree_replace_slot(&mapping->page_tree, slot, page); + __radix_tree_replace(&mapping->page_tree, node, slot, page, + workingset_update_node, mapping); mapping->nrpages++; - if (node) { - workingset_node_pages_inc(node); - /* - * Don't track node that contains actual pages. - * - * Avoid acquiring the list_lru lock if already - * untracked. The list_empty() test is safe as - * node->private_list is protected by - * mapping->tree_lock. - */ - if (!list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - } return 0; } @@ -185,8 +167,6 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index + i, &node, &slot); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - if (!node) { VM_BUG_ON_PAGE(nr != 1, page); /* @@ -196,33 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping, shadow = NULL; } - radix_tree_replace_slot(&mapping->page_tree, slot, shadow); - - if (!node) - break; - - workingset_node_pages_dec(node); - if (shadow) - workingset_node_shadows_inc(node); - else - if (__radix_tree_delete_node(&mapping->page_tree, node)) - continue; - - /* - * Track node that only contains shadow entries. DAX mappings - * contain no shadow entries and may contain other exceptional - * entries so skip those. - * - * Avoid acquiring the list_lru lock if already tracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!dax_mapping(mapping) && !workingset_node_pages(node) && - list_empty(&node->private_list)) { - node->private_data = mapping; - list_lru_add(&workingset_shadow_nodes, - &node->private_list); - } + radix_tree_clear_tags(&mapping->page_tree, node, slot); + __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + workingset_update_node, mapping); } if (shadow) { diff --git a/mm/truncate.c b/mm/truncate.c index 3c631c357873..fd97f1dbce29 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping, * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(&mapping->page_tree, slot, NULL); + __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + workingset_update_node, mapping); mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } diff --git a/mm/workingset.c b/mm/workingset.c index 98f830897b1b..ef556bf1323d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -334,18 +335,45 @@ out: * point where they would still be useful. */ -struct list_lru workingset_shadow_nodes; +static struct list_lru shadow_nodes; + +void workingset_update_node(struct radix_tree_node *node, void *private) +{ + struct address_space *mapping = private; + + /* Only regular page cache has shadow entries */ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. + * + * Avoid acquiring the list_lru lock when the nodes are + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by &mapping->tree_lock. + */ + if (node->count && node->count == node->exceptional) { + if (list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&shadow_nodes, &node->private_list); + } + } else { + if (!list_empty(&node->private_list)) + list_lru_del(&shadow_nodes, &node->private_list); + } +} static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long shadow_nodes; unsigned long max_nodes; + unsigned long nodes; unsigned long pages; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); + nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); if (sc->memcg) { @@ -372,10 +400,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, */ max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); - if (shadow_nodes <= max_nodes) + if (nodes <= max_nodes) return 0; - return shadow_nodes - max_nodes; + return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, @@ -418,22 +446,25 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!workingset_node_shadows(node))) + if (WARN_ON_ONCE(!node->exceptional)) goto out_invalid; - if (WARN_ON_ONCE(workingset_node_pages(node))) + if (WARN_ON_ONCE(node->count != node->exceptional)) goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) goto out_invalid; + if (WARN_ON_ONCE(!node->exceptional)) + goto out_invalid; if (WARN_ON_ONCE(!mapping->nrexceptional)) goto out_invalid; node->slots[i] = NULL; - workingset_node_shadows_dec(node); + node->exceptional--; + node->count--; mapping->nrexceptional--; } } - if (WARN_ON_ONCE(workingset_node_shadows(node))) + if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node); @@ -456,8 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, - shadow_lru_isolate, NULL); + ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); return ret; } @@ -496,7 +526,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key); if (ret) goto err; ret = register_shrinker(&workingset_shadow_shrinker); @@ -504,7 +534,7 @@ static int __init workingset_init(void) goto err_list_lru; return 0; err_list_lru: - list_lru_destroy(&workingset_shadow_nodes); + list_lru_destroy(&shadow_nodes); err: return ret; } -- cgit v1.2.3