From 33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Mon, 22 Aug 2022 13:23:47 -0400
Subject: bcachefs: Deadlock cycle detector

We've outgrown our own deadlock avoidance strategy.

The btree iterator API provides an interface where the user doesn't
need to concern themselves with lock ordering - different btree
iterators can be traversed in any order. Without special care, this
will lead to deadlocks.

Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.

That approach had some issues, though.

 - Sometimes we'd issue transaction restarts unnecessarily, when no
   deadlock would have actually occurred. Lock ordering restarts have
   become our primary cause of transaction restarts, on some workloads
   totalling 20% of actual transaction commits.

 - To avoid deadlock or livelock, we'd often have to take intent locks
   when we only wanted a read lock: with the lock ordering approach, it
   is actually illegal to hold _any_ read lock while blocking on an
   intent lock, and this has been causing us unnecessary lock
   contention.

 - It was getting fragile - the various lock ordering rules are not
   trivial, and we'd been seeing occasional livelock issues related to
   this machinery.

So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.

When we block taking a btree lock, after adding ourselves to the
waitlist but before sleeping, we do a DFS of btree transactions waiting
on other btree transactions, starting with the current transaction and
walking our held locks, and transactions blocking on our held locks.

If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we cannot allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.

Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
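[Editor's sketch] For readers unfamiliar with the technique, the snippet
below shows wait-for-graph cycle detection by DFS in miniature. It is a
self-contained illustration, not the bcachefs implementation: the real
detector (bch2_six_check_for_deadlock(), wired up via
six_lock_type_waiter() in the diff below) builds the graph dynamically
from held locks and their waiters, and on nofail paths may pick another
transaction as the victim. The transaction indices and edge matrix here
are hypothetical.

/*
 * Miniature wait-for-graph cycle detection by DFS (illustrative only,
 * not bcachefs code). edge[i][j] means transaction i is waiting on a
 * lock held by transaction j. Revisiting a node that is already on
 * the current DFS path means a wait-for cycle, i.e. a deadlock.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_TXNS 8

static bool edge[MAX_TXNS][MAX_TXNS];	/* hypothetical wait-for graph */

static bool dfs_cycle(int t, bool on_path[])
{
	if (on_path[t])
		return true;		/* back to our own path: cycle */
	on_path[t] = true;

	for (int j = 0; j < MAX_TXNS; j++)
		if (edge[t][j] && dfs_cycle(j, on_path))
			return true;

	on_path[t] = false;		/* backtrack */
	return false;
}

int main(void)
{
	bool on_path[MAX_TXNS] = { false };

	/* txn 0 waits on 1, 1 on 2, 2 on 0: a three-way deadlock */
	edge[0][1] = edge[1][2] = edge[2][0] = true;

	printf("deadlock from txn 0: %s\n",
	       dfs_cycle(0, on_path) ? "yes" : "no");
	return 0;
}

Per the commit message, the real walk happens after the blocking
transaction has added itself to the lock's waitlist but before it
sleeps, so the cycle it is about to create is already visible in the
graph when the DFS runs.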
Signed-off-by: Kent Overstreet
---
 fs/bcachefs/btree_locking.h | 65 +++++++++++++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 17 deletions(-)

(limited to 'fs/bcachefs/btree_locking.h')

diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index aea2ebafffd8..874dd4428b3a 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -183,22 +183,41 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
 void bch2_btree_node_unlock_write(struct btree_trans *, struct btree_path *,
 				  struct btree *);
 
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
 /* lock: */
 
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+					 struct btree_bkey_cached_common *b,
+					 enum six_lock_type type,
+					 bool lock_may_not_fail)
+{
+	int ret;
+
+	trans->lock_may_not_fail = lock_may_not_fail;
+	trans->locking = b;
+	trans->lock_must_abort = false;
+
+	ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+			       bch2_six_check_for_deadlock, trans);
+	WRITE_ONCE(trans->locking, NULL);
+	WRITE_ONCE(trans->locking_wait.start_time, 0);
+	return ret;
+}
+
 static inline int __must_check
 btree_node_lock_nopath(struct btree_trans *trans,
 		       struct btree_bkey_cached_common *b,
 		       enum six_lock_type type)
 {
-	six_lock_type(&b->lock, type, NULL, NULL);
-	return 0;
+	return __btree_node_lock_nopath(trans, b, type, false);
 }
 
 static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
 						 struct btree_bkey_cached_common *b,
 						 enum six_lock_type type)
 {
-	int ret = btree_node_lock_nopath(trans, b, type);
+	int ret = __btree_node_lock_nopath(trans, b, type, true);
 
 	BUG_ON(ret);
 }
@@ -210,8 +229,6 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
 				       enum six_lock_type type,
 				       six_lock_should_sleep_fn should_sleep_fn, void *p)
 {
-	int ret;
-
 	if (six_trylock_type(&b->lock, type))
 		return 0;
 
@@ -219,11 +236,10 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
 	trans->locking_pos	= pos;
 	trans->locking_btree_id	= path->btree_id;
 	trans->locking_level	= level;
-	trans->locking_lock_type = type;
+	trans->lock_may_not_fail = false;
 	trans->locking		= b;
-	ret = six_lock_type(&b->lock, type, should_sleep_fn, p);
-	trans->locking = NULL;
-	return ret;
+	return six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+				   bch2_six_check_for_deadlock, trans);
 }
 
 /*
@@ -279,12 +295,15 @@ static inline int btree_node_lock(struct btree_trans *trans,
 	return ret;
 }
 
-void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *);
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *, bool);
 
-static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
-						     struct btree_path *path,
-						     struct btree_bkey_cached_common *b)
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+					  struct btree_path *path,
+					  struct btree_bkey_cached_common *b,
+					  bool lock_may_not_fail)
 {
+	int ret;
+
 	EBUG_ON(&path->l[b->level].b->c != b);
 	EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
 	EBUG_ON(!btree_node_intent_locked(path, b->level));
@@ -296,8 +315,21 @@ static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
 	 */
 	mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
 
-	if (unlikely(!six_trylock_write(&b->lock)))
-		__bch2_btree_node_lock_write(trans, b);
+	ret = likely(six_trylock_write(&b->lock))
+		? 0
+		: __bch2_btree_node_lock_write(trans, b, lock_may_not_fail);
+	if (ret)
+		mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);
+
+	return ret;
+}
+
+static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+						     struct btree_path *path,
+						     struct btree_bkey_cached_common *b)
+{
+	int ret = __btree_node_lock_write(trans, path, b, true);
+	BUG_ON(ret);
 }
 
 static inline int __must_check
@@ -305,8 +337,7 @@ bch2_btree_node_lock_write(struct btree_trans *trans,
 			   struct btree_path *path,
 			   struct btree_bkey_cached_common *b)
 {
-	bch2_btree_node_lock_write_nofail(trans, path, b);
-	return 0;
+	return __btree_node_lock_write(trans, path, b, false);
 }
 
 /* relock: */
-- 
cgit v1.2.3
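[Editor's sketch] A hedged illustration of the caller-side contract
implied by the diff above, now that bch2_btree_node_lock_write() can
fail: example_write_op() is hypothetical (not code from this patch),
and real bcachefs callers go through the transaction-restart machinery
rather than open-coding the retry.

/*
 * Hypothetical caller of the failable write lock. If the cycle
 * detector picks this transaction as the deadlock victim,
 * bch2_btree_node_lock_write() fails with a transaction-restart
 * error; we propagate it so the outer retry loop can call
 * bch2_trans_begin() and re-traverse paths in a deadlock-free order.
 * Callers that must not fail (e.g. the btree split path) use
 * bch2_btree_node_lock_write_nofail() instead.
 */
static int example_write_op(struct btree_trans *trans,
			    struct btree_path *path,
			    struct btree_bkey_cached_common *b)
{
	int ret = bch2_btree_node_lock_write(trans, path, b);
	if (ret)
		return ret;	/* transaction restart: unwind and retry */

	/* ... modify the node under the write lock ... */

	bch2_btree_node_unlock_write(trans, path,
				     container_of(b, struct btree, c));
	return 0;
}

The design choice worth noting: the failable and nofail variants share
one implementation, with the lock_may_not_fail flag telling the cycle
detector whether this transaction may be chosen as the victim or
whether another transaction in the cycle must be aborted instead.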