summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2022-03-05 03:16:04 +0300
committerKent Overstreet <kent.overstreet@linux.dev>2023-10-23 00:09:26 +0300
commit3098553776a16c08446c408005090423d62e6b54 (patch)
tree17cc317918e2fb51dd540d86650510928965cb5e
parent5b3f780540aa5e39859a0c00ace61713da054a0f (diff)
downloadlinux-3098553776a16c08446c408005090423d62e6b54.tar.xz
bcachefs: Fix usage of six lock's percpu mode
Six locks have a percpu mode, which we use for interior btree nodes, as well as btree key cache keys for the subvolumes btree. We've been switching locks back and forth between percpu and non percpu mode as needed, but it turns out this is racy - when we're reusing an existing node, other threads could be attempting to lock it while we're switching it between modes. This patch fixes this by never switching 'struct btree' between the two modes, and instead segragating them between two different freed lists. Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--fs/bcachefs/btree_cache.c41
-rw-r--r--fs/bcachefs/btree_cache.h2
-rw-r--r--fs/bcachefs/btree_io.c2
-rw-r--r--fs/bcachefs/btree_key_cache.c10
-rw-r--r--fs/bcachefs/btree_types.h3
-rw-r--r--fs/bcachefs/btree_update_interior.c99
-rw-r--r--fs/bcachefs/btree_update_interior.h6
-rw-r--r--fs/bcachefs/debug.c5
8 files changed, 99 insertions, 69 deletions
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 42253ca17f04..92a8cc704cab 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -40,6 +40,14 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc)
return max_t(int, 0, bc->used - bc->reserve);
}
+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
+{
+ if (b->c.lock.readers)
+ list_move(&b->list, &bc->freed_pcpu);
+ else
+ list_move(&b->list, &bc->freed_nonpcpu);
+}
+
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
@@ -56,7 +64,8 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
b->aux_data = NULL;
bc->used--;
- list_move(&b->list, &bc->freed);
+
+ btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -162,11 +171,6 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
b->c.level = level;
b->c.btree_id = id;
- if (level)
- six_lock_pcpu_alloc(&b->c.lock);
- else
- six_lock_pcpu_free_rcu(&b->c.lock);
-
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
@@ -432,8 +436,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
BUG_ON(atomic_read(&c->btree_cache.dirty));
- while (!list_empty(&bc->freed)) {
- b = list_first_entry(&bc->freed, struct btree, list);
+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
+
+ while (!list_empty(&bc->freed_nonpcpu)) {
+ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
list_del(&b->list);
six_lock_pcpu_free(&b->c.lock);
kfree(b);
@@ -487,7 +493,8 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
mutex_init(&bc->lock);
INIT_LIST_HEAD(&bc->live);
INIT_LIST_HEAD(&bc->freeable);
- INIT_LIST_HEAD(&bc->freed);
+ INIT_LIST_HEAD(&bc->freed_pcpu);
+ INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
/*
@@ -562,9 +569,12 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
}
}
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
{
struct btree_cache *bc = &c->btree_cache;
+ struct list_head *freed = pcpu_read_locks
+ ? &bc->freed_pcpu
+ : &bc->freed_nonpcpu;
struct btree *b, *b2;
u64 start_time = local_clock();
unsigned flags;
@@ -576,7 +586,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
* We never free struct btree itself, just the memory that holds the on
* disk node. Check the freed list before allocating a new one:
*/
- list_for_each_entry(b, &bc->freed, list)
+ list_for_each_entry(b, freed, list)
if (!btree_node_reclaim(c, b)) {
list_del_init(&b->list);
goto got_node;
@@ -586,6 +596,9 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
if (!b)
goto err_locked;
+ if (pcpu_read_locks)
+ six_lock_pcpu_alloc(&b->c.lock);
+
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));
got_node:
@@ -598,7 +611,7 @@ got_node:
if (!btree_node_reclaim(c, b2)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
- list_move(&b2->list, &bc->freed);
+ btree_node_to_freedlist(bc, b2);
six_unlock_write(&b2->c.lock);
six_unlock_intent(&b2->c.lock);
goto got_mem;
@@ -643,7 +656,7 @@ err_locked:
if (b) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
- list_move(&b2->list, &bc->freed);
+ btree_node_to_freedlist(bc, b2);
six_unlock_write(&b2->c.lock);
six_unlock_intent(&b2->c.lock);
} else {
@@ -688,7 +701,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
return ERR_PTR(-EINTR);
}
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, level != 0);
if (trans && b == ERR_PTR(-ENOMEM)) {
trans->memory_allocation_failure = true;
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 96f8f90e85a1..83723805f12a 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -20,7 +20,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool);
struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 53f83340f69a..3031b566a112 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1542,7 +1542,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, level != 0);
bch2_btree_cache_cannibalize_unlock(c);
BUG_ON(IS_ERR(b));
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 70f31b5379e7..7e41552a57df 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -166,13 +166,13 @@ btree_key_cache_create(struct bch_fs *c,
}
was_new = false;
+ } else {
+ if (btree_id == BTREE_ID_subvolumes)
+ six_lock_pcpu_alloc(&ck->c.lock);
+ else
+ six_lock_pcpu_free(&ck->c.lock);
}
- if (btree_id == BTREE_ID_subvolumes)
- six_lock_pcpu_alloc(&ck->c.lock);
- else
- six_lock_pcpu_free(&ck->c.lock);
-
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 561406b4b7c2..51eb686331bf 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -152,7 +152,8 @@ struct btree_cache {
struct mutex lock;
struct list_head live;
struct list_head freeable;
- struct list_head freed;
+ struct list_head freed_pcpu;
+ struct list_head freed_nonpcpu;
/* Number of elements in live + freeable lists */
unsigned used;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 523d1146b2e2..43022b340f4e 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -181,6 +181,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
struct disk_reservation *res,
struct closure *cl,
+ bool interior_node,
unsigned flags)
{
struct write_point *wp;
@@ -242,7 +243,7 @@ retry:
bch2_open_bucket_get(c, wp, &ob);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, interior_node);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -260,12 +261,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
{
struct bch_fs *c = as->c;
struct btree *b;
+ struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
int ret;
BUG_ON(level >= BTREE_MAX_DEPTH);
- BUG_ON(!as->nr_prealloc_nodes);
+ BUG_ON(!p->nr);
- b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ b = p->b[--p->nr];
six_lock_intent(&b->c.lock, NULL, NULL);
six_lock_write(&b->c.lock, NULL, NULL);
@@ -377,43 +379,49 @@ static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
static void bch2_btree_reserve_put(struct btree_update *as)
{
struct bch_fs *c = as->c;
+ struct prealloc_nodes *p;
mutex_lock(&c->btree_reserve_cache_lock);
- while (as->nr_prealloc_nodes) {
- struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ for (p = as->prealloc_nodes;
+ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
+ p++) {
+ while (p->nr) {
+ struct btree *b = p->b[--p->nr];
- six_lock_intent(&b->c.lock, NULL, NULL);
- six_lock_write(&b->c.lock, NULL, NULL);
+ six_lock_intent(&b->c.lock, NULL, NULL);
+ six_lock_write(&b->c.lock, NULL, NULL);
- if (c->btree_reserve_cache_nr <
- ARRAY_SIZE(c->btree_reserve_cache)) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
+ if (c->btree_reserve_cache_nr <
+ ARRAY_SIZE(c->btree_reserve_cache)) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
- a->ob = b->ob;
- b->ob.nr = 0;
- bkey_copy(&a->k, &b->key);
- } else {
- bch2_open_buckets_put(c, &b->ob);
- }
+ a->ob = b->ob;
+ b->ob.nr = 0;
+ bkey_copy(&a->k, &b->key);
+ } else {
+ bch2_open_buckets_put(c, &b->ob);
+ }
- __btree_node_free(c, b);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
+ __btree_node_free(c, b);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+ }
}
mutex_unlock(&c->btree_reserve_cache_lock);
}
-static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
+static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes[2],
unsigned flags, struct closure *cl)
{
struct bch_fs *c = as->c;
struct btree *b;
+ unsigned interior;
int ret;
- BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
+ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);
/*
* Protects reaping from the btree node cache and using the btree node
@@ -423,23 +431,28 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
if (ret)
return ret;
- while (as->nr_prealloc_nodes < nr_nodes) {
- b = __bch2_btree_node_alloc(c, &as->disk_res,
- flags & BTREE_INSERT_NOWAIT
- ? NULL : cl, flags);
- if (IS_ERR(b)) {
- ret = PTR_ERR(b);
- goto err_free;
- }
+ for (interior = 0; interior < 2; interior++) {
+ struct prealloc_nodes *p = as->prealloc_nodes + interior;
+
+ while (p->nr < nr_nodes[interior]) {
+ b = __bch2_btree_node_alloc(c, &as->disk_res,
+ flags & BTREE_INSERT_NOWAIT
+ ? NULL : cl,
+ interior, flags);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto err;
+ }
- as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
+ p->b[p->nr++] = b;
+ }
}
bch2_btree_cache_cannibalize_unlock(c);
return 0;
-err_free:
+err:
bch2_btree_cache_cannibalize_unlock(c);
- trace_btree_reserve_get_fail(c, nr_nodes, cl);
+ trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], cl);
return ret;
}
@@ -942,7 +955,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
u64 start_time = local_clock();
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
- unsigned nr_nodes;
+ unsigned nr_nodes[2];
unsigned update_level = level;
int journal_flags = 0;
int ret = 0;
@@ -954,10 +967,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
closure_init_stack(&cl);
retry:
- nr_nodes = 0;
+ nr_nodes[0] = nr_nodes[1] = 0;
+ update_level = level;
while (1) {
- nr_nodes += 1 + split;
+ nr_nodes[!!update_level] += 1 + split;
update_level++;
if (!btree_path_node(path, update_level))
@@ -972,7 +986,7 @@ retry:
/* Might have to allocate a new root: */
if (update_level < BTREE_MAX_DEPTH)
- nr_nodes += 1;
+ nr_nodes[1] += 1;
if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
@@ -1050,7 +1064,7 @@ retry:
}
ret = bch2_disk_reservation_get(c, &as->disk_res,
- nr_nodes * btree_sectors(c),
+ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
disk_res_flags);
if (ret)
@@ -1085,11 +1099,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
list_del_init(&b->list);
mutex_unlock(&c->btree_cache.lock);
- if (b->c.level)
- six_lock_pcpu_alloc(&b->c.lock);
- else
- six_lock_pcpu_free(&b->c.lock);
-
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->c.level < btree_node_root(c, b)->c.level ||
@@ -2015,7 +2024,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
return -EINTR;
}
- new_hash = bch2_btree_node_mem_alloc(c);
+ new_hash = bch2_btree_node_mem_alloc(c, false);
}
path->intent_ref++;
@@ -2091,7 +2100,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
closure_sync(&cl);
} while (ret);
- b = bch2_btree_node_mem_alloc(c);
+ b = bch2_btree_node_mem_alloc(c, false);
bch2_btree_cache_cannibalize_unlock(c);
set_btree_node_fake(b);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 8dc86fa636d6..e72eb8795616 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -76,8 +76,10 @@ struct btree_update {
struct journal_entry_pin journal;
/* Preallocated nodes we reserve when we start the update: */
- struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
- unsigned nr_prealloc_nodes;
+ struct prealloc_nodes {
+ struct btree *b[BTREE_UPDATE_NODES_MAX];
+ unsigned nr;
+ } prealloc_nodes[2];
/* Nodes being freed: */
struct keylist old_keys;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 1fff03d301a9..457fcee7d8e1 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -443,6 +443,11 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
bch2_flags_to_text(out, bch2_btree_node_flags, b->flags);
pr_newline(out);
+ pr_buf(out, "pcpu read locks: ");
+ pr_tab(out);
+ pr_buf(out, "%u", b->c.lock.readers != NULL);
+ pr_newline(out);
+
pr_buf(out, "written:");
pr_tab(out);
pr_buf(out, "%u", b->written);