From 1dd7f9d98de0740b42f1ac3f0b1d8af9c76801de Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Apr 2019 21:53:12 -0400 Subject: bcachefs: Rewrite journal_seq_blacklist machinery Now, we store blacklisted journal sequence numbers in the superblock, not the journal: this helps to greatly simplify the code, and more importantly it's now implemented in a way that doesn't require all btree nodes to be visited before starting the journal - instead, we unconditionally blacklist the next 4 journal sequence numbers after an unclean shutdown. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_seq_blacklist.c | 491 ++++++++++++++++-------------------- 1 file changed, 223 insertions(+), 268 deletions(-) (limited to 'fs/bcachefs/journal_seq_blacklist.c') diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 45c8d38d12de..0df8dfccd5b5 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -1,13 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" +#include "btree_iter.h" +#include "eytzinger.h" #include "journal_seq_blacklist.h" +#include "super-io.h" /* * journal_seq_blacklist machinery: @@ -37,327 +34,285 @@ * record that it was blacklisted so that a) on recovery we don't think we have * missing journal entries and b) so that the btree code continues to ignore * that bset, until that btree node is rewritten. - * - * Blacklisted journal sequence numbers are themselves recorded as entries in - * the journal. */ -/* - * Called when journal needs to evict a blacklist entry to reclaim space: find - * any btree nodes that refer to the blacklist journal sequence numbers, and - * rewrite them: - */ -static void journal_seq_blacklist_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) +static unsigned +blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) { - struct bch_fs *c = - container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl = - container_of(pin, struct journal_seq_blacklist, pin); - struct blacklisted_node n; - struct closure cl; - unsigned i; - int ret; + return bl + ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / + sizeof(struct journal_seq_blacklist_entry)) + : 0; +} - closure_init_stack(&cl); +static unsigned sb_blacklist_u64s(unsigned nr) +{ + struct bch_sb_field_journal_seq_blacklist *bl; - for (i = 0;; i++) { - struct btree_trans trans; - struct btree_iter *iter; - struct btree *b; + return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); +} - bch2_trans_init(&trans, c); +static struct bch_sb_field_journal_seq_blacklist * +blacklist_entry_try_merge(struct bch_fs *c, + struct bch_sb_field_journal_seq_blacklist *bl, + unsigned i) +{ + unsigned nr = blacklist_nr_entries(bl); + + if (le64_to_cpu(bl->start[i].end) >= + le64_to_cpu(bl->start[i + 1].start)) { + bl->start[i].end = bl->start[i + 1].end; + --nr; + memmove(&bl->start[i], + &bl->start[i + 1], + sizeof(bl->start[0]) * (nr - i)); + + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr)); + BUG_ON(!bl); + } - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); + return bl; +} - iter = bch2_trans_get_node_iter(&trans, n.btree_id, n.pos, - 0, 0, 0); +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) +{ + struct bch_sb_field_journal_seq_blacklist *bl; + unsigned i, nr; + int ret = 0; - b = bch2_btree_iter_peek_node(iter); + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + nr = blacklist_nr_entries(bl); - /* The node might have already been rewritten: */ + if (bl) { + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = + bl->start + i; - if (b->data->keys.seq == n.seq) { - ret = bch2_btree_node_rewrite(c, iter, n.seq, 0); - if (ret) { - bch2_trans_exit(&trans); - bch2_fs_fatal_error(c, - "error %i rewriting btree node with blacklisted journal seq", - ret); - bch2_journal_halt(j); - return; + if (start == le64_to_cpu(e->start) && + end == le64_to_cpu(e->end)) + goto out; + + if (start <= le64_to_cpu(e->start) && + end >= le64_to_cpu(e->end)) { + e->start = cpu_to_le64(start); + e->end = cpu_to_le64(end); + + if (i + 1 < nr) + bl = blacklist_entry_try_merge(c, + bl, i); + if (i) + bl = blacklist_entry_try_merge(c, + bl, i - 1); + goto out_write_sb; } } - - bch2_trans_exit(&trans); } - for (i = 0;; i++) { - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); -redo_wait: - mutex_lock(&c->btree_interior_update_lock); - - /* - * Is the node on the list of pending interior node updates - - * being freed? If so, wait for that to finish: - */ - for_each_pending_btree_node_free(c, as, d) - if (n.seq == d->seq && - n.btree_id == d->btree_id && - !d->level && - !bkey_cmp(n.pos, d->key.k.p)) { - closure_wait(&as->wait, &cl); - mutex_unlock(&c->btree_interior_update_lock); - closure_sync(&cl); - goto redo_wait; - } - - mutex_unlock(&c->btree_interior_update_lock); + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr + 1)); + if (!bl) { + ret = -ENOMEM; + goto out; } - mutex_lock(&j->blacklist_lock); + bl->start[nr].start = cpu_to_le64(start); + bl->start[nr].end = cpu_to_le64(end); +out_write_sb: + c->disk_sb.sb->features[0] |= + 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3; - bch2_journal_pin_drop(j, &bl->pin); - list_del(&bl->list); - kfree(bl->entries); - kfree(bl); + ret = bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); - mutex_unlock(&j->blacklist_lock); + return ret; } -/* - * Determine if a particular sequence number is blacklisted - if so, return - * blacklist entry: - */ -struct journal_seq_blacklist * -bch2_journal_seq_blacklist_find(struct journal *j, u64 seq) +static int journal_seq_blacklist_table_cmp(const void *_l, + const void *_r, size_t size) { - struct journal_seq_blacklist *bl; + const struct journal_seq_blacklist_table_entry *l = _l; + const struct journal_seq_blacklist_table_entry *r = _r; - lockdep_assert_held(&j->blacklist_lock); - - list_for_each_entry(bl, &j->seq_blacklist, list) - if (seq >= bl->start && seq <= bl->end) - return bl; - - return NULL; + return (l->start > r->start) - (l->start < r->start); } -/* - * Allocate a new, in memory blacklist entry: - */ -static struct journal_seq_blacklist * -bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end) +bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, + bool dirty) { - struct journal_seq_blacklist *bl; + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + struct journal_seq_blacklist_table_entry search = { .start = seq }; + int idx; - lockdep_assert_held(&j->blacklist_lock); + if (!t) + return false; - /* - * When we start the journal, bch2_journal_start() will skip over @seq: - */ + idx = eytzinger0_find_le(t->entries, t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + &search); + if (idx < 0) + return false; - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return NULL; + BUG_ON(t->entries[idx].start > seq); - bl->start = start; - bl->end = end; + if (seq >= t->entries[idx].end) + return false; - list_add_tail(&bl->list, &j->seq_blacklist); - return bl; + if (dirty) + t->entries[idx].dirty = true; + return true; } -/* - * Returns true if @seq is newer than the most recent journal entry that got - * written, and data corresponding to @seq should be ignored - also marks @seq - * as blacklisted so that on future restarts the corresponding data will still - * be ignored: - */ -int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) +int bch2_blacklist_table_initialize(struct bch_fs *c) { - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl = NULL; - struct blacklisted_node *n; - u64 journal_seq; - int ret = 0; - - if (!seq) - return 0; + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + struct journal_seq_blacklist_table *t; + unsigned i, nr = blacklist_nr_entries(bl); - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - spin_unlock(&j->lock); + BUG_ON(c->journal_seq_blacklist_table); - /* Interier updates aren't journalled: */ - BUG_ON(b->level); - BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); + if (!bl) + return 0; - /* - * Decrease this back to j->seq + 2 when we next rev the on disk format: - * increasing it temporarily to work around bug in old kernels - */ - fsck_err_on(seq > journal_seq + 4, c, - "bset journal seq too far in the future: %llu > %llu", - seq, journal_seq); + t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, + GFP_KERNEL); + if (!t) + return -ENOMEM; - if (seq <= journal_seq && - list_empty_careful(&j->seq_blacklist)) - return 0; + t->nr = nr; - mutex_lock(&j->blacklist_lock); - - if (seq <= journal_seq) { - bl = bch2_journal_seq_blacklist_find(j, seq); - if (!bl) - goto out; - } else { - bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", - b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); - - if (!j->new_blacklist) { - j->new_blacklist = bch2_journal_seq_blacklisted_new(j, - journal_seq + 1, - journal_seq + 1); - if (!j->new_blacklist) { - ret = -ENOMEM; - goto out; - } - } - bl = j->new_blacklist; - bl->end = max(bl->end, seq); + for (i = 0; i < nr; i++) { + t->entries[i].start = le64_to_cpu(bl->start[i].start); + t->entries[i].end = le64_to_cpu(bl->start[i].end); } - for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) - if (b->data->keys.seq == n->seq && - b->btree_id == n->btree_id && - !bkey_cmp(b->key.k.p, n->pos)) - goto found_entry; - - if (!bl->nr_entries || - is_power_of_2(bl->nr_entries)) { - n = krealloc(bl->entries, - max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n), - GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto out; - } - bl->entries = n; - } + eytzinger0_sort(t->entries, + t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + NULL); - bl->entries[bl->nr_entries++] = (struct blacklisted_node) { - .seq = b->data->keys.seq, - .btree_id = b->btree_id, - .pos = b->key.k.p, - }; -found_entry: - ret = 1; -out: -fsck_err: - mutex_unlock(&j->blacklist_lock); - return ret; + c->journal_seq_blacklist_table = t; + return 0; } -static int __bch2_journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i, - u64 start, u64 end) +static const char * +bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, + struct bch_sb_field *f) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl; - - bch_verbose(c, "blacklisting existing journal seq %llu-%llu", - start, end); + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + struct journal_seq_blacklist_entry *i; + unsigned nr = blacklist_nr_entries(bl); + + for (i = bl->start; i < bl->start + nr; i++) { + if (le64_to_cpu(i->start) >= + le64_to_cpu(i->end)) + return "entry start >= end"; + + if (i + 1 < bl->start + nr && + le64_to_cpu(i[0].end) > + le64_to_cpu(i[1].start)) + return "entries out of order"; + } - bl = bch2_journal_seq_blacklisted_new(j, start, end); - if (!bl) - return -ENOMEM; + return NULL; +} - bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin, - journal_seq_blacklist_flush); - return 0; +static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + struct journal_seq_blacklist_entry *i; + unsigned nr = blacklist_nr_entries(bl); + + for (i = bl->start; i < bl->start + nr; i++) { + if (i != bl->start) + pr_buf(out, " "); + + pr_buf(out, "%llu-%llu", + le64_to_cpu(i->start), + le64_to_cpu(i->end)); + } } -/* - * After reading the journal, find existing journal seq blacklist entries and - * read them into memory: - */ -int bch2_journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i) +const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { + .validate = bch2_sb_journal_seq_blacklist_validate, + .to_text = bch2_sb_journal_seq_blacklist_to_text +}; + +void bch2_blacklist_entries_gc(struct work_struct *work) { - struct jset_entry *entry; - int ret = 0; + struct bch_fs *c = container_of(work, struct bch_fs, + journal_seq_blacklist_gc_work); + struct journal_seq_blacklist_table *t; + struct bch_sb_field_journal_seq_blacklist *bl; + struct journal_seq_blacklist_entry *src, *dst; + struct btree_trans trans; + unsigned i, nr, new_nr; + int ret; - vstruct_for_each(&i->j, entry) { - switch (entry->type) { - case BCH_JSET_ENTRY_blacklist: { - struct jset_entry_blacklist *bl_entry = - container_of(entry, struct jset_entry_blacklist, entry); + bch2_trans_init(&trans, c); - ret = __bch2_journal_seq_blacklist_read(j, i, - le64_to_cpu(bl_entry->seq), - le64_to_cpu(bl_entry->seq)); - break; - } - case BCH_JSET_ENTRY_blacklist_v2: { - struct jset_entry_blacklist_v2 *bl_entry = - container_of(entry, struct jset_entry_blacklist_v2, entry); - - ret = __bch2_journal_seq_blacklist_read(j, i, - le64_to_cpu(bl_entry->start), - le64_to_cpu(bl_entry->end)); - break; - } - } + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_iter *iter; + struct btree *b; - if (ret) - break; + for_each_btree_node(&trans, iter, i, POS_MIN, + BTREE_ITER_PREFETCH, b) + if (test_bit(BCH_FS_STOPPING, &c->flags)) { + bch2_trans_exit(&trans); + return; + } + bch2_trans_iter_free(&trans, iter); } - return ret; -} - -/* - * After reading the journal and walking the btree, we might have new journal - * sequence numbers to blacklist - add entries to the next journal entry to be - * written: - */ -void bch2_journal_seq_blacklist_write(struct journal *j) -{ - struct journal_seq_blacklist *bl = j->new_blacklist; - struct jset_entry_blacklist_v2 *bl_entry; - struct jset_entry *entry; + ret = bch2_trans_exit(&trans); + if (ret) + return; + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); if (!bl) - return; + goto out; - entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j), - (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64)); + nr = blacklist_nr_entries(bl); + dst = bl->start; - bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); - bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2; - bl_entry->start = cpu_to_le64(bl->start); - bl_entry->end = cpu_to_le64(bl->end); + t = c->journal_seq_blacklist_table; + BUG_ON(nr != t->nr); + + for (src = bl->start, i = eytzinger0_first(t->nr); + src < bl->start + nr; + src++, i = eytzinger0_next(i, nr)) { + BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); + BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); + + if (t->entries[i].dirty) + *dst++ = *src; + } - bch2_journal_pin_add(j, - journal_cur_seq(j), - &bl->pin, - journal_seq_blacklist_flush); + new_nr = dst - bl->start; - j->new_blacklist = NULL; + bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); + + if (new_nr != nr) { + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + new_nr ? sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + + if (!new_nr) + c->disk_sb.sb->features[0] &= + ~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3); + + bch2_write_super(c); + } +out: + mutex_unlock(&c->sb_lock); } -- cgit v1.2.3