author    | Kent Overstreet <kent.overstreet@gmail.com> | 2022-08-22 20:23:47 +0300
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-23 00:09:41 +0300
commit    | 33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d (patch)
tree      | 6fff6e218b381e0fa2cd4580da3a2e919d18ccd8 /fs/bcachefs/btree_locking.c
parent    | 62448afee714354a26db8a0f3c644f58628f0792 (diff)
download  | linux-33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d.tar.xz
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
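To illustrate, here's a hypothetical snippet (iterator positions and
inum are made up, not from this patch) showing how two transactions
traversing the same two btrees in opposite order produce a classic
ABBA deadlock if nothing intervenes:

	struct btree_iter inodes_iter, extents_iter;

	bch2_trans_iter_init(trans, &inodes_iter,  BTREE_ID_inodes,
			     POS(0, inum), 0);
	bch2_trans_iter_init(trans, &extents_iter, BTREE_ID_extents,
			     POS(inum, 0), 0);

	bch2_btree_iter_peek(&inodes_iter);	/* locks a node in inodes */
	bch2_btree_iter_peek(&extents_iter);	/*
						 * may block on an extents node
						 * held by a transaction that's
						 * blocked on our inodes lock
						 */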
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
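Roughly, the old scheme looked like this (a sketch, not the deleted
code; btree_path_cmp_would_invert() is a hypothetical stand-in for the
real ordering checks):

	static int old_style_lock(struct btree_trans *trans,
				  struct btree_path *path, struct btree *b,
				  enum six_lock_type type)
	{
		struct btree_path *linked;

		if (six_trylock_type(&b->c.lock, type))
			return 0;

		/*
		 * trylock failed: if blocking here could invert the global
		 * lock order, restart instead of sleeping:
		 */
		trans_for_each_path(trans, linked)
			if (linked->nodes_locked &&
			    btree_path_cmp_would_invert(linked, path))
				return btree_trans_restart(trans,
					BCH_ERR_transaction_restart_would_deadlock);

		six_lock_type(&b->c.lock, type, NULL, NULL);
		return 0;
	}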
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totaling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourselves to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
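In essence, it's a bounded DFS over the waits-for graph, where nodes
are transactions and an edge A -> B means A is waiting on a lock B
holds. A condensed, self-contained sketch (the real version is
lock_graph_descend()/check_for_deadlock() in the diff below, which
also takes refs on transactions and copes with the graph changing
underneath it):

	#include <stdbool.h>

	#define MAX_CYCLE_DEPTH 8	/* mirrors the fixed-size lock_graph */

	struct txn {
		struct txn	**waits_on;	/* holders of the lock we're blocked on */
		unsigned	nr_waits_on;
	};

	/* true if following waits-for edges from @t leads back to @start: */
	static bool waits_for_cycle(struct txn *start, struct txn *t,
				    unsigned depth)
	{
		if (depth == MAX_CYCLE_DEPTH)
			return true;	/* hit the depth limit: treat as a deadlock */

		for (unsigned i = 0; i < t->nr_waits_on; i++)
			if (t->waits_on[i] == start ||
			    waits_for_cycle(start, t->waits_on[i], depth + 1))
				return true;
		return false;
	}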
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we cannot allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
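The handoff when the victim is another transaction works like this
(compressed from abort_lock() and the top of check_for_deadlock() in
the diff below):

	/* in the transaction that found the cycle: */
	victim->lock_must_abort = true;
	wake_up_process(victim->locking_wait.task);

	/* in the victim, once woken, before it can go back to sleep: */
	if (trans->lock_must_abort)
		return btree_trans_restart(trans,
				BCH_ERR_transaction_restart_would_deadlock);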
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Diffstat (limited to 'fs/bcachefs/btree_locking.c')
-rw-r--r-- | fs/bcachefs/btree_locking.c | 246
1 file changed, 243 insertions(+), 3 deletions(-)
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 5b6d8184ea45..869f4163a3c6 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -52,10 +52,248 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans,

 /* lock */

-void __bch2_btree_node_lock_write(struct btree_trans *trans,
-				  struct btree_bkey_cached_common *b)
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+	struct btree_trans		*trans;
+	struct btree_bkey_cached_common	*node_want;
+	enum six_lock_type		lock_want;
+
+	/* for iterating over held locks :*/
+	u8				path_idx;
+	u8				level;
+	u64				lock_start_time;
+};
+
+struct lock_graph {
+	struct trans_waiting_for_lock	g[8];
+	unsigned			nr;
+};
+
+static void lock_graph_pop(struct lock_graph *g)
+{
+	closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+	int ret;
+
+	if (i == g->g) {
+		ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+	} else {
+		i->trans->lock_must_abort = true;
+		ret = 0;
+	}
+
+	for (i = g->g + 1; i < g->g + g->nr; i++)
+		wake_up_process(i->trans->locking_wait.task);
+	return ret;
+}
+
+static noinline int break_cycle(struct lock_graph *g)
+{
+	struct trans_waiting_for_lock *i;
+
+	for (i = g->g; i < g->g + g->nr; i++) {
+		if (i->trans->lock_may_not_fail ||
+		    i->trans->locking_wait.lock_want == SIX_LOCK_write)
+			continue;
+
+		return abort_lock(g, i);
+	}
+
+	for (i = g->g; i < g->g + g->nr; i++) {
+		if (i->trans->lock_may_not_fail ||
+		    !i->trans->in_traverse_all)
+			continue;
+
+		return abort_lock(g, i);
+	}
+
+	for (i = g->g; i < g->g + g->nr; i++) {
+		if (i->trans->lock_may_not_fail)
+			continue;
+
+		return abort_lock(g, i);
+	}
+
+	BUG();
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans)
+{
+	struct btree_trans *orig_trans = g->g->trans;
+	struct trans_waiting_for_lock *i;
+	int ret = 0;
+
+	for (i = g->g; i < g->g + g->nr; i++) {
+		if (i->trans->locking != i->node_want)
+			while (g->g + g->nr >= i) {
+				lock_graph_pop(g);
+				return 0;
+			}
+
+		if (i->trans == trans) {
+			ret = break_cycle(g);
+			if (ret)
+				goto deadlock;
+			/*
+			 * If we didn't abort (instead telling another
+			 * transaction to abort), keep checking:
+			 */
+		}
+	}
+
+	if (g->nr == ARRAY_SIZE(g->g)) {
+		if (orig_trans->lock_may_not_fail)
+			return 0;
+
+		trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
+		ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+		goto deadlock;
+	}
+
+	closure_get(&trans->ref);
+
+	g->g[g->nr++] = (struct trans_waiting_for_lock) {
+		.trans		= trans,
+		.node_want	= trans->locking,
+		.lock_want	= trans->locking_wait.lock_want,
+	};
+
+	return 0;
+deadlock:
+	while (g->nr)
+		lock_graph_pop(g);
+	return ret;
+}
+
+#if 0
+static void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+	struct trans_waiting_for_lock *i;
+
+	prt_str(out, "Found lock cycle:");
+	prt_newline(out);
+
+	for (i = g->g; i < g->g + g->nr; i++)
+		bch2_btree_trans_to_text(out, i->trans);
+}
+#endif
+
+static noinline void lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+	struct trans_waiting_for_lock *i;
+
+	for (i = g->g + 1; i < g->g + g->nr; i++)
+		if (i->trans->locking != i->node_want ||
+		    i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+			while (g->g + g->nr >= i)
+				lock_graph_pop(g);
+			return;
+		}
+	BUG();
+}
+
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
+	return t1 + t2 > 1;
+}
+
+static int check_for_deadlock(struct btree_trans *trans)
+{
+	struct lock_graph g;
+	struct trans_waiting_for_lock *top;
+	struct btree_bkey_cached_common *b;
+	struct btree_path *path;
+	int ret;
+
+	if (trans->lock_must_abort)
+		return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+
+	g.nr = 0;
+	ret = lock_graph_descend(&g, trans);
+	BUG_ON(ret);
+next:
+	if (!g.nr)
+		return 0;
+
+	top = &g.g[g.nr - 1];
+
+	trans_for_each_path_from(top->trans, path, top->path_idx) {
+		if (!path->nodes_locked)
+			continue;
+
+		if (top->path_idx != path->idx) {
+			top->path_idx		= path->idx;
+			top->level		= 0;
+			top->lock_start_time	= 0;
+		}
+
+		for (;
+		     top->level < BTREE_MAX_DEPTH;
+		     top->level++, top->lock_start_time = 0) {
+			int lock_held = btree_node_locked_type(path, top->level);
+
+			if (lock_held == BTREE_NODE_UNLOCKED)
+				continue;
+
+			b = &READ_ONCE(path->l[top->level].b)->c;
+
+			if (unlikely(IS_ERR_OR_NULL(b))) {
+				lock_graph_remove_non_waiters(&g);
+				goto next;
+			}
+
+			if (list_empty_careful(&b->lock.wait_list))
+				continue;
+
+			raw_spin_lock(&b->lock.wait_lock);
+			list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+				BUG_ON(b != trans->locking);
+
+				if (top->lock_start_time &&
+				    time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+					continue;
+
+				top->lock_start_time = trans->locking_wait.start_time;
+
+				/* Don't check for self deadlock: */
+				if (trans == top->trans ||
+				    !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+					continue;
+
+				ret = lock_graph_descend(&g, trans);
+				raw_spin_unlock(&b->lock.wait_lock);
+
+				if (ret)
+					return ret < 0 ? ret : 0;
+				goto next;
+
+			}
+			raw_spin_unlock(&b->lock.wait_lock);
+		}
+	}
+
+	lock_graph_pop(&g);
+	goto next;
+}
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+	struct btree_trans *trans = p;
+
+	return check_for_deadlock(trans);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans,
+				 struct btree_bkey_cached_common *b,
+				 bool lock_may_not_fail)
 {
 	int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+	int ret;

 	/*
 	 * Must drop our read locks before calling six_lock_write() -
@@ -64,8 +302,10 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans,
 	 * locked:
 	 */
 	six_lock_readers_add(&b->lock, -readers);
-	btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write);
+	ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail);
 	six_lock_readers_add(&b->lock, readers);
+
+	return ret;
 }

 static inline bool path_has_read_locks(struct btree_path *path)