diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2019-04-12 05:39:39 +0300 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-23 00:08:20 +0300 |
commit | 644d180b055fa47be7e6ca8b684f45e2350dfafd (patch) | |
tree | e7842030427308ac1f4b7c69b5f365e7e6bb39aa /fs | |
parent | 3ea2b1e12898154d6fae49b22a3509521ba49d38 (diff) | |
download | linux-644d180b055fa47be7e6ca8b684f45e2350dfafd.tar.xz |
bcachefs: Journal replay refactoring
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/bcachefs/journal.c | 15 | ||||
-rw-r--r-- | fs/bcachefs/journal_io.c | 130 | ||||
-rw-r--r-- | fs/bcachefs/journal_io.h | 2 | ||||
-rw-r--r-- | fs/bcachefs/journal_types.h | 1 | ||||
-rw-r--r-- | fs/bcachefs/recovery.c | 343 |
5 files changed, 251 insertions(+), 240 deletions(-)
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 969612e612e0..25d0631c43dd 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -984,9 +984,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, u64 last_seq = cur_seq, nr, seq; if (!list_empty(journal_entries)) - last_seq = le64_to_cpu(list_last_entry(journal_entries, - struct journal_replay, - list)->j.last_seq); + last_seq = le64_to_cpu(list_first_entry(journal_entries, + struct journal_replay, + list)->j.seq); nr = cur_seq - last_seq; @@ -999,6 +999,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, } } + j->replay_journal_seq = last_seq; + j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->pin.front = last_seq; j->pin.back = cur_seq; @@ -1007,7 +1009,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, fifo_for_each_entry_ptr(p, &j->pin, seq) { INIT_LIST_HEAD(&p->list); INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); + atomic_set(&p->count, 1); p->devs.nr = 0; } @@ -1016,10 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, BUG_ON(seq < last_seq || seq >= cur_seq); - p = journal_seq_pin(j, seq); - - atomic_set(&p->count, 1); - p->devs = i->devs; + journal_seq_pin(j, seq)->devs = i->devs; } spin_lock(&j->lock); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 8010b38114ac..4fd7b048050b 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "alloc_background.h" #include "alloc_foreground.h" -#include "btree_gc.h" -#include "btree_update.h" #include "buckets.h" #include "checksum.h" #include "error.h" @@ -642,18 +639,6 @@ err: goto out; } -void bch2_journal_entries_free(struct list_head *list) -{ - - while (!list_empty(list)) { - struct journal_replay *i = - list_first_entry(list, struct journal_replay, list); - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - 
vstruct_bytes(&i->j)); - } -} - int bch2_journal_read(struct bch_fs *c, struct list_head *list) { struct journal_list jlist; @@ -733,121 +718,6 @@ fsck_err: return ret; } -/* journal replay: */ - -static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) -{ - struct btree_trans trans; - struct btree_iter *iter; - /* - * We might cause compressed extents to be - * split, so we need to pass in a - * disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - BKEY_PADDED(k) split; - int ret; - - bch2_trans_init(&trans, c); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - do { - ret = bch2_btree_iter_traverse(iter); - if (ret) - break; - - bkey_copy(&split.k, k); - bch2_cut_front(iter->pos, &split.k); - bch2_extent_trim_atomic(&split.k, iter); - - ret = bch2_disk_reservation_add(c, &disk_res, - split.k.k.size * - bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)), - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k)); - ret = bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY); - } while ((!ret || ret == -EINTR) && - bkey_cmp(k->k.p, iter->pos)); - - bch2_disk_reservation_put(c, &disk_res); - - /* - * This isn't strictly correct - we should only be relying on the btree - * node lock for synchronization with gc when we've got a write lock - * held. 
- * - * but - there are other correctness issues if btree gc were to run - * before journal replay finishes - */ - BUG_ON(c->gc_pos.phase); - - bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), - NULL, 0, 0); - bch2_trans_exit(&trans); - - return ret; -} - -int bch2_journal_replay(struct bch_fs *c, struct list_head *list) -{ - struct journal *j = &c->journal; - struct bkey_i *k, *_n; - struct jset_entry *entry; - struct journal_replay *i, *n; - int ret = 0; - - list_for_each_entry_safe(i, n, list, list) { - j->replay_journal_seq = le64_to_cpu(i->j.seq); - - for_each_jset_key(k, _n, entry, &i->j) { - switch (entry->btree_id) { - case BTREE_ID_ALLOC: - ret = bch2_alloc_replay_key(c, k); - break; - case BTREE_ID_EXTENTS: - ret = bch2_extent_replay_key(c, k); - break; - default: - ret = bch2_btree_insert(c, entry->btree_id, k, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY| - BTREE_INSERT_NOMARK); - break; - } - - if (ret) { - bch_err(c, "journal replay: error %d while replaying key", - ret); - goto err; - } - - cond_resched(); - } - - bch2_journal_pin_put(j, j->replay_journal_seq); - } - - j->replay_journal_seq = 0; - - bch2_journal_set_replay_done(j); - bch2_journal_flush_all_pins(j); - ret = bch2_journal_error(j); -err: - bch2_journal_entries_free(list); - return ret; -} - /* journal write: */ static void __journal_write_alloc(struct journal *j, diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 4bb174839956..72e575f360af 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -36,8 +36,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, vstruct_for_each_safe(entry, k, _n) int bch2_journal_read(struct bch_fs *, struct list_head *); -void bch2_journal_entries_free(struct list_head *); -int bch2_journal_replay(struct bch_fs *, struct list_head *); void bch2_journal_write(struct closure *); diff --git a/fs/bcachefs/journal_types.h 
b/fs/bcachefs/journal_types.h index 7349b50bc5e7..0585e9b6e230 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -203,6 +203,7 @@ struct journal { } pin; u64 replay_journal_seq; + u64 replay_journal_seq_end; struct write_point wp; spinlock_t err_lock; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b1fcc105cffd..2e849135195d 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -12,94 +12,162 @@ #include "error.h" #include "fsck.h" #include "journal_io.h" +#include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "quota.h" #include "recovery.h" #include "replicas.h" #include "super-io.h" +#include <linux/sort.h> #include <linux/stat.h> #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -static struct bkey_i *btree_root_find(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct jset *j, - enum btree_id id, unsigned *level) +/* journal replay: */ + +static void bch2_journal_entries_free(struct list_head *list) { - struct bkey_i *k; - struct jset_entry *entry, *start, *end; - if (clean) { - start = clean->start; - end = vstruct_end(&clean->field); - } else { - start = j->start; - end = vstruct_last(j); + while (!list_empty(list)) { + struct journal_replay *i = + list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); } +} - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root && - entry->btree_id == id) - goto found; +static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) +{ + struct btree_trans trans; + struct btree_iter *iter; + /* + * We might cause compressed extents to be + * split, so we need to pass in a + * disk_reservation: + */ + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + BKEY_PADDED(k) split; + int ret; - return NULL; -found: - if (!entry->u64s) - return ERR_PTR(-EINVAL); + 
bch2_trans_init(&trans, c); - k = entry->start; - *level = entry->level; - return k; -} + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + do { + ret = bch2_btree_iter_traverse(iter); + if (ret) + break; -static int verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean **cleanp, - struct jset *j) -{ - unsigned i; - struct bch_sb_field_clean *clean = *cleanp; - int ret = 0; + bkey_copy(&split.k, k); + bch2_cut_front(iter->pos, &split.k); + bch2_extent_trim_atomic(&split.k, iter); - if (!clean || !j) - return 0; + ret = bch2_disk_reservation_add(c, &disk_res, + split.k.k.size * + bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)), + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, - "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", - le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) { - kfree(clean); - *cleanp = NULL; - return 0; + bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k)); + ret = bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY); + } while ((!ret || ret == -EINTR) && + bkey_cmp(k->k.p, iter->pos)); + + bch2_disk_reservation_put(c, &disk_res); + + /* + * This isn't strictly correct - we should only be relying on the btree + * node lock for synchronization with gc when we've got a write lock + * held. 
+ * + * but - there are other correctness issues if btree gc were to run + * before journal replay finishes + */ + BUG_ON(c->gc_pos.phase); + + bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size), + NULL, 0, 0); + bch2_trans_exit(&trans); + + return ret; +} + +static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id btree_id, + struct bkey_i *k) +{ + switch (btree_id) { + case BTREE_ID_ALLOC: + return bch2_alloc_replay_key(c, k); + case BTREE_ID_EXTENTS: + return bch2_extent_replay_key(c, k); + default: + return bch2_btree_insert(c, btree_id, k, + NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY| + BTREE_INSERT_NOMARK); } +} - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); - mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock read clock doesn't match journal after clean shutdown"); +static void replay_now_at(struct journal *j, u64 seq) +{ + BUG_ON(seq < j->replay_journal_seq); + BUG_ON(seq > j->replay_journal_seq_end); - for (i = 0; i < BTREE_ID_NR; i++) { - struct bkey_i *k1, *k2; - unsigned l1 = 0, l2 = 0; + while (j->replay_journal_seq < seq) + bch2_journal_pin_put(j, j->replay_journal_seq++); +} - k1 = btree_root_find(c, clean, NULL, i, &l1); - k2 = btree_root_find(c, NULL, j, i, &l2); +static int bch2_journal_replay(struct bch_fs *c, struct list_head *list) +{ + struct journal *j = &c->journal; + struct bkey_i *k, *_n; + struct jset_entry *entry; + struct journal_replay *i, *n; + int ret = 0; - if (!k1 && !k2) - continue; + list_for_each_entry_safe(i, n, list, list) { + replay_now_at(j, le64_to_cpu(i->j.seq)); - mustfix_fsck_err_on(!k1 || !k2 || - IS_ERR(k1) || - IS_ERR(k2) || - k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(k1)) || - l1 != l2, c, - "superblock btree root doesn't match journal after clean shutdown"); + for_each_jset_key(k, _n, entry, &i->j) { + ret = 
bch2_journal_replay_key(c, entry->btree_id, k); + if (ret) { + bch_err(c, "journal replay: error %d while replaying key", + ret); + goto err; + } + + cond_resched(); + } } -fsck_err: + + replay_now_at(j, j->replay_journal_seq_end); + j->replay_journal_seq = 0; + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); + ret = bch2_journal_error(j); +err: + bch2_journal_entries_free(list); return ret; } +static bool journal_empty(struct list_head *journal) +{ + return list_empty(journal) || + journal_entry_empty(&list_last_entry(journal, + struct journal_replay, list)->j); +} + static int verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, struct list_head *journal) @@ -130,40 +198,7 @@ fsck_err: return ret; } -static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean, *sb_clean; - int ret; - - mutex_lock(&c->sb_lock); - sb_clean = bch2_sb_get_clean(c->disk_sb.sb); - - if (fsck_err_on(!sb_clean, c, - "superblock marked clean but clean section not present")) { - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - mutex_unlock(&c->sb_lock); - return NULL; - } - - clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), - GFP_KERNEL); - if (!clean) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(-ENOMEM); - } - - if (le16_to_cpu(c->disk_sb.sb->version) < - bcachefs_metadata_version_bkey_renumber) - bch2_sb_clean_renumber(clean, READ); - - mutex_unlock(&c->sb_lock); - - return clean; -fsck_err: - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); -} +/* journal replay early: */ static int journal_replay_entry_early(struct bch_fs *c, struct jset_entry *entry) @@ -275,6 +310,121 @@ static int journal_replay_early(struct bch_fs *c, return 0; } +/* sb clean section: */ + +static struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; 
+ + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +static int verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean **cleanp, + struct jset *j) +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; + int ret = 0; + + if (!clean || !j) + return 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { + kfree(clean); + *cleanp = NULL; + return 0; + } + + mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, + "superblock read clock doesn't match journal after clean shutdown"); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(k1)) || + l1 != l2, c, + "superblock btree root doesn't match journal after clean shutdown"); + } +fsck_err: + return ret; +} + +static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean, *sb_clean; + int ret; + + mutex_lock(&c->sb_lock); + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + + if (fsck_err_on(!sb_clean, c, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); 
+ c->sb.clean = false; + mutex_unlock(&c->sb_lock); + return NULL; + } + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(-ENOMEM); + } + + if (le16_to_cpu(c->disk_sb.sb->version) < + bcachefs_metadata_version_bkey_renumber) + bch2_sb_clean_renumber(clean, READ); + + mutex_unlock(&c->sb_lock); + + return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); +} + static int read_btree_roots(struct bch_fs *c) { unsigned i; @@ -320,13 +470,6 @@ fsck_err: return ret; } -static bool journal_empty(struct list_head *journal) -{ - return list_empty(journal) || - journal_entry_empty(&list_last_entry(journal, - struct journal_replay, list)->j); -} - int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; |