author    Kent Overstreet <kent.overstreet@gmail.com>  2019-12-30 22:37:25 +0300
committer Kent Overstreet <kent.overstreet@linux.dev>  2023-10-23 00:08:36 +0300
commit    e3e464ac6d09269b19cea3dc32b626db44d0e6ba (patch)
tree      7aafd377933161ed88573a5e3dab7ee3d8e0e06a
parent    57b0b3db475de6b724e4db3b827c00484cdde642 (diff)
bcachefs: Move extent overwrite handling out of core btree code
Ever since the btree code was first written, handling of overwriting existing extents - including partially overwriting and splitting existing extents - was handled as part of the core btree insert path. The modern transaction and iterator infrastructure didn't exist then, so that was the only way for it to be done.

This patch moves that outside of the core btree code, to a pass that runs at transaction commit time. This is a significant simplification to the btree code and an overall reduction in code size, but more importantly it gets us much closer to the core btree code being completely independent of extents, and is important prep work for snapshots.

This introduces a new feature bit; the old and new extent update models are incompatible when the filesystem needs journal replay.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
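In outline, the new commit-time pass (see __bch2_trans_commit() in the btree_update_leaf.c hunks below) works as follows; this is a simplified paraphrase of the code in this patch, with the coalescing of adjacent contiguous extent updates and all error handling omitted:

	/* pass 1: for each extent update, trim or whiteout whatever
	 * existing extents it overwrites, queueing the results as
	 * ordinary keyed updates on the new updates2 list */
	trans_for_each_update(trans, i)
		if (i->iter->flags & BTREE_ITER_IS_EXTENTS)
			extent_handle_overwrites(trans, i->iter->btree_id,
						 bkey_start_pos(&i->k->k),
						 i->k->k.p);

	/* pass 2: requeue the updates themselves as non-extent keys */
	trans_for_each_update(trans, i)
		if (i->iter->flags & BTREE_ITER_IS_EXTENTS)
			extent_update_to_keys(trans, i->iter, i->k);
		else
			bch2_trans_update2(trans, i->iter, i->k);

Only the updates2 list is then inserted into the btree, so the core insert path never sees an overlapping extent.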
-rw-r--r--  fs/bcachefs/bcachefs_format.h        |   6
-rw-r--r--  fs/bcachefs/btree_gc.c               |  57
-rw-r--r--  fs/bcachefs/btree_io.c               |  17
-rw-r--r--  fs/bcachefs/btree_iter.c             |  25
-rw-r--r--  fs/bcachefs/btree_types.h            |   3
-rw-r--r--  fs/bcachefs/btree_update.h           |   5
-rw-r--r--  fs/bcachefs/btree_update_interior.h  |  23
-rw-r--r--  fs/bcachefs/btree_update_leaf.c      | 228
-rw-r--r--  fs/bcachefs/buckets.c                |  13
-rw-r--r--  fs/bcachefs/buckets.h                |   2
-rw-r--r--  fs/bcachefs/extent_update.c          | 410
-rw-r--r--  fs/bcachefs/extent_update.h          |   5
-rw-r--r--  fs/bcachefs/fsck.c                   |  56
-rw-r--r--  fs/bcachefs/recovery.c               | 154
-rw-r--r--  fs/bcachefs/recovery.h               |   2
15 files changed, 404 insertions, 602 deletions
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index d1c0a5d5580e..1ad5ff449a5b 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1315,12 +1315,14 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(inline_data, 8) \
x(new_extent_overwrite, 9) \
x(incompressible, 10) \
- x(btree_ptr_v2, 11)
+ x(btree_ptr_v2, 11) \
+ x(extents_above_btree_updates, 12)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
- (1ULL << BCH_FEATURE_btree_ptr_v2))
+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \
+ (1ULL << BCH_FEATURE_extents_above_btree_updates))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
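The new feature bit is enforced at mount time (see the recovery.c hunk near the end of this patch): a filesystem that is not clean and does not have the bit cannot be recovered by the new code, and the bit is set on anything this version mounts:

	if (!c->sb.clean &&
	    !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
		bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
		ret = -EINVAL;
		goto err;
	}

	c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;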
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index a5fe3b316e06..f85fbc057fb3 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -186,8 +186,16 @@ fsck_err:
return ret;
}
-static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
- u8 *max_stale, bool initial)
+static bool pos_in_journal_keys(struct journal_keys *journal_keys,
+ enum btree_id id, struct bpos pos)
+{
+ struct journal_key *k = journal_key_search(journal_keys, id, pos);
+
+ return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos);
+}
+
+static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
+ struct journal_keys *journal_keys, bool initial)
{
struct btree_node_iter iter;
struct bkey unpacked;
@@ -201,6 +209,10 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
for_each_btree_node_key_unpack(b, k, &iter,
&unpacked) {
+ if (!b->c.level && journal_keys &&
+ pos_in_journal_keys(journal_keys, b->c.btree_id, k.k->p))
+ continue;
+
bch2_bkey_debugcheck(c, b, k);
ret = bch2_gc_mark_key(c, k, max_stale, initial);
@@ -212,6 +224,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b,
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
+ struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
struct btree_trans trans;
@@ -239,7 +252,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
gc_pos_set(c, gc_pos_btree_node(b));
- ret = btree_gc_mark_node(c, b, &max_stale, initial);
+ ret = btree_gc_mark_node(c, b, &max_stale,
+ journal_keys, initial);
if (ret)
break;
@@ -281,36 +295,6 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
(int) btree_id_to_gc_phase(r);
}
-static int mark_journal_key(struct bch_fs *c, enum btree_id id,
- struct bkey_i *insert)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- u8 max_stale;
- int ret = 0;
-
- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
- if (ret)
- return ret;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
- BTREE_ITER_SLOTS, k, ret) {
- percpu_down_read(&c->mark_lock);
- ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
- BTREE_TRIGGER_GC|
- BTREE_TRIGGER_NOATOMIC);
- percpu_up_read(&c->mark_lock);
-
- if (!ret)
- break;
- }
-
- return bch2_trans_exit(&trans) ?: ret;
-}
-
static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
@@ -325,18 +309,21 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
enum btree_id id = ids[i];
enum btree_node_type type = __btree_node_type(0, id);
- int ret = bch2_gc_btree(c, id, initial, metadata_only);
+ int ret = bch2_gc_btree(c, id, journal_keys,
+ initial, metadata_only);
if (ret)
return ret;
if (journal_keys && !metadata_only &&
btree_node_type_needs_gc(type)) {
struct journal_key *j;
+ u8 max_stale;
int ret;
for_each_journal_key(*journal_keys, j)
if (j->btree_id == id) {
- ret = mark_journal_key(c, id, j->k);
+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k),
+ &max_stale, initial);
if (ret)
return ret;
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index a4732bf13a11..d0b761417903 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -708,9 +708,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
unsigned *whiteout_u64s, int write,
bool have_retry)
{
- struct bkey_packed *k;
- struct bkey prev = KEY(0, 0, 0);
- struct bpos prev_data = POS_MIN;
+ struct bkey_packed *k, *prev = NULL;
bool seen_non_whiteout = false;
unsigned version;
const char *err;
@@ -852,15 +850,15 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
if (!seen_non_whiteout &&
(!bkey_whiteout(k) ||
- (bkey_cmp(prev.p, bkey_start_pos(u.k)) > 0))) {
+ (prev && bkey_iter_cmp(b, prev, k) > 0))) {
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
- } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 ||
- bkey_cmp(prev.p, u.k->p) > 0) {
+ } else if (prev && bkey_iter_cmp(b, prev, k) > 0) {
char buf1[80];
char buf2[80];
+ struct bkey up = bkey_unpack_key(b, prev);
- bch2_bkey_to_text(&PBUF(buf1), &prev);
+ bch2_bkey_to_text(&PBUF(buf1), &up);
bch2_bkey_to_text(&PBUF(buf2), u.k);
bch2_dump_bset(b, i, 0);
@@ -870,10 +868,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
/* XXX: repair this */
}
- if (!bkey_deleted(u.k))
- prev_data = u.k->p;
- prev = *u.k;
-
+ prev = k;
k = bkey_next_skip_noops(k, vstruct_last(i));
}
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 347477c62779..5f918c6c3efb 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1504,12 +1504,12 @@ static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter)
struct btree_trans *trans = iter->trans;
struct btree_insert_entry *i;
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if ((cmp_int(iter->btree_id, i->iter->btree_id) ?:
bkey_cmp(pos, i->k->k.p)) <= 0)
break;
- return i < trans->updates + trans->nr_updates &&
+ return i < trans->updates2 + trans->nr_updates2 &&
iter->btree_id == i->iter->btree_id
? bkey_i_to_s_c(i->k)
: bkey_s_c_null;
@@ -1821,7 +1821,7 @@ int bch2_trans_iter_free(struct btree_trans *trans,
static int bch2_trans_realloc_iters(struct btree_trans *trans,
unsigned new_size)
{
- void *new_iters, *new_updates;
+ void *p, *new_iters, *new_updates, *new_updates2;
size_t iters_bytes;
size_t updates_bytes;
@@ -1839,21 +1839,27 @@ static int bch2_trans_realloc_iters(struct btree_trans *trans,
iters_bytes = sizeof(struct btree_iter) * new_size;
updates_bytes = sizeof(struct btree_insert_entry) * new_size;
- new_iters = kmalloc(iters_bytes + updates_bytes, GFP_NOFS);
- if (new_iters)
+ p = kmalloc(iters_bytes +
+ updates_bytes +
+ updates_bytes, GFP_NOFS);
+ if (p)
goto success;
- new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
new_size = BTREE_ITER_MAX;
trans->used_mempool = true;
success:
- new_updates = new_iters + iters_bytes;
+ new_iters = p; p += iters_bytes;
+ new_updates = p; p += updates_bytes;
+ new_updates2 = p; p += updates_bytes;
memcpy(new_iters, trans->iters,
sizeof(struct btree_iter) * trans->nr_iters);
memcpy(new_updates, trans->updates,
sizeof(struct btree_insert_entry) * trans->nr_updates);
+ memcpy(new_updates2, trans->updates2,
+ sizeof(struct btree_insert_entry) * trans->nr_updates2);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
memset(trans->iters, POISON_FREE,
@@ -1865,6 +1871,7 @@ success:
trans->iters = new_iters;
trans->updates = new_updates;
+ trans->updates2 = new_updates2;
trans->size = new_size;
if (trans->iters_live) {
@@ -2126,6 +2133,7 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
trans->need_reset = 0;
trans->nr_updates = 0;
+ trans->nr_updates2 = 0;
trans->mem_top = 0;
if (trans->fs_usage_deltas) {
@@ -2157,6 +2165,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
trans->size = ARRAY_SIZE(trans->iters_onstack);
trans->iters = trans->iters_onstack;
trans->updates = trans->updates_onstack;
+ trans->updates2 = trans->updates2_onstack;
trans->fs_usage_deltas = NULL;
if (expected_nr_iters > trans->size)
@@ -2194,5 +2203,5 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
sizeof(struct btree_iter) * nr +
sizeof(struct btree_insert_entry) * nr +
- sizeof(u8) * nr);
+ sizeof(struct btree_insert_entry) * nr);
}
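The iterators and both update arrays now come out of a single allocation, carved up with pointer arithmetic, which is also why the mempool element size in bch2_fs_btree_iter_init() changes above. The layout is:

	/*
	 * one allocation:
	 *   [ new_size x struct btree_iter         ]  -> trans->iters
	 *   [ new_size x struct btree_insert_entry ]  -> trans->updates
	 *   [ new_size x struct btree_insert_entry ]  -> trans->updates2
	 */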
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index d1d5385d1eb7..fdfa7a265850 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -283,6 +283,7 @@ struct btree_trans {
u8 nr_iters;
u8 nr_updates;
+ u8 nr_updates2;
u8 size;
unsigned used_mempool:1;
unsigned error:1;
@@ -295,6 +296,7 @@ struct btree_trans {
struct btree_iter *iters;
struct btree_insert_entry *updates;
+ struct btree_insert_entry *updates2;
/* update path: */
struct journal_res journal_res;
@@ -308,6 +310,7 @@ struct btree_trans {
struct btree_iter iters_onstack[2];
struct btree_insert_entry updates_onstack[2];
+ struct btree_insert_entry updates2_onstack[2];
};
#define BTREE_FLAG(flag) \
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index d1cd839ac08f..12127a33906b 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -132,4 +132,9 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
+#define trans_for_each_update2(_trans, _i) \
+ for ((_i) = (_trans)->updates2; \
+ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \
+ (_i)++)
+
#endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index e3204f32cc68..f6aceed89427 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -303,18 +303,23 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
}
static inline void push_whiteout(struct bch_fs *c, struct btree *b,
- struct bkey_packed *k)
+ struct bpos pos)
{
- unsigned u64s = bkeyp_key_u64s(&b->format, k);
- struct bkey_packed *dst;
+ struct bkey_packed k;
- BUG_ON(u64s > bch_btree_keys_u64s_remaining(c, b));
+ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
- b->whiteout_u64s += bkeyp_key_u64s(&b->format, k);
- dst = unwritten_whiteouts_start(c, b);
- memcpy_u64s(dst, k, u64s);
- dst->u64s = u64s;
- dst->type = KEY_TYPE_deleted;
+ if (!bkey_pack_pos(&k, pos, b)) {
+ struct bkey *u = (void *) &k;
+
+ bkey_init(u);
+ u->p = pos;
+ }
+
+ k.needs_whiteout = true;
+
+ b->whiteout_u64s += k.u64s;
+ bkey_copy(unwritten_whiteouts_start(c, b), &k);
}
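Since the whiteout is now synthesized from a position instead of copied from an existing packed key, its worst-case size is known statically, which is what the reworked BUG_ON checks:

	/*
	 * a whiteout carries no value, so it takes at most BKEY_U64s u64s:
	 *   pos packs      -> k.u64s <= BKEY_U64s
	 *   pos won't pack -> fall back to an unpacked struct bkey,
	 *                     exactly BKEY_U64s u64s
	 */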
/*
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 94418c9b42e8..f0efc52c7590 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -23,11 +23,10 @@
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- return i != trans->updates &&
+ return i != trans->updates2 &&
i[0].iter->l[0].b == i[-1].iter->l[0].b;
}
-
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
{
@@ -61,6 +60,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
bkey_cmp(insert->k.p, b->data->max_key) > 0);
+ EBUG_ON(insert->k.u64s >
+ bch_btree_keys_u64s_remaining(iter->trans->c, b));
+ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_packed(b, k, &insert->k))
@@ -79,7 +81,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
- push_whiteout(iter->trans->c, b, k);
+ push_whiteout(iter->trans->c, b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
@@ -195,20 +197,6 @@ void bch2_btree_journal_key(struct btree_trans *trans,
set_btree_node_dirty(b);
}
-static void bch2_insert_fixup_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- EBUG_ON(iter->level);
- EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, l->b));
-
- if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert)))
- bch2_btree_journal_key(trans, iter, insert);
-}
-
/**
* btree_insert_key - insert a key one key into a leaf node
*/
@@ -223,12 +211,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
+ EBUG_ON(iter->level);
+
insert->k.needs_whiteout = false;
- if (!btree_node_is_extents(b))
- bch2_insert_fixup_key(trans, iter, insert);
- else
- bch2_insert_fixup_extent(trans, iter, insert);
+ if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert)))
+ bch2_btree_journal_key(trans, iter, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -254,12 +242,8 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct bch_fs *c = trans->c;
BUG_ON(iter->level);
- BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos));
- EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
- bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0);
-
+ BUG_ON(bkey_cmp(insert->k.p, iter->pos));
BUG_ON(debug_check_bkeys(c) &&
- !bkey_deleted(&insert->k) &&
bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
}
@@ -312,9 +296,16 @@ btree_key_can_insert(struct btree_trans *trans,
if (unlikely(btree_node_fake(b)))
return BTREE_INSERT_BTREE_NODE_FULL;
+ /*
+ * old bch2_extent_sort_fix_overlapping() algorithm won't work with new
+ * style extent updates:
+ */
+ if (unlikely(btree_node_old_extent_overwrite(b)))
+ return BTREE_INSERT_BTREE_NODE_FULL;
+
ret = !btree_node_is_extents(b)
? BTREE_INSERT_OK
- : bch2_extent_can_insert(trans, iter, insert, u64s);
+ : bch2_extent_can_insert(trans, iter, insert);
if (ret)
return ret;
@@ -383,7 +374,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
prefetch(&trans->c->journal.flags);
- trans_for_each_update(trans, i) {
+ trans_for_each_update2(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
@@ -422,10 +413,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
else if (inject_invalid_keys(c))
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
i->k->k.version = MAX_VERSION;
}
@@ -448,7 +439,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
if (unlikely(c->gc_pos.phase))
bch2_trans_mark_gc(trans);
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
do_btree_insert_one(trans, i->iter, i->k);
err:
if (marking) {
@@ -469,7 +460,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
struct btree_iter *iter;
int ret;
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
BUG_ON(!btree_node_intent_locked(i->iter, 0));
ret = bch2_journal_preres_get(&trans->c->journal,
@@ -497,18 +488,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
}
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
btree_insert_entry_checks(trans, i->iter, i->k);
bch2_btree_trans_verify_locks(trans);
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(trans->c,
i->iter->l[0].b, i->iter);
ret = bch2_trans_commit_write_locked(trans, stopped_at);
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
i->iter);
@@ -525,14 +516,14 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (trans->flags & BTREE_INSERT_NOUNLOCK)
trans->nounlock = true;
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_foreground_maybe_merge(trans->c, i->iter,
0, trans->flags);
trans->nounlock = false;
- trans_for_each_update(trans, i)
+ trans_for_each_update2(trans, i)
bch2_btree_iter_downgrade(i->iter);
return 0;
@@ -655,6 +646,135 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
return 0;
}
+static void bch2_trans_update2(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert)
+{
+ struct btree_insert_entry *i, n = (struct btree_insert_entry) {
+ .iter = iter, .k = insert
+ };
+
+ btree_insert_entry_checks(trans, n.iter, n.k);
+
+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+
+ EBUG_ON(trans->nr_updates2 >= trans->nr_iters);
+
+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+
+ trans_for_each_update2(trans, i) {
+ if (btree_iter_cmp(n.iter, i->iter) == 0) {
+ *i = n;
+ return;
+ }
+
+ if (btree_iter_cmp(n.iter, i->iter) <= 0)
+ break;
+ }
+
+ array_insert_item(trans->updates2, trans->nr_updates2,
+ i - trans->updates2, n);
+}
+
+static int extent_update_to_keys(struct btree_trans *trans,
+ struct btree_iter *orig_iter,
+ struct bkey_i *insert)
+{
+ struct btree_iter *iter;
+
+ if (bkey_deleted(&insert->k))
+ return 0;
+
+ iter = bch2_trans_copy_iter(trans, orig_iter);
+ if (IS_ERR(iter))
+ return PTR_ERR(iter);
+
+ iter->flags |= BTREE_ITER_INTENT;
+ __bch2_btree_iter_set_pos(iter, insert->k.p, false);
+ bch2_trans_update2(trans, iter, insert);
+ bch2_trans_iter_put(trans, iter);
+ return 0;
+}
+
+static int extent_handle_overwrites(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos start, struct bpos end)
+{
+ struct btree_iter *iter = NULL, *update_iter;
+ struct bkey_i *update;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT);
+ ret = PTR_ERR_OR_ZERO(iter);
+ if (ret)
+ return ret;
+
+ k = bch2_btree_iter_peek_with_updates(iter);
+
+ while (k.k && !(ret = bkey_err(k))) {
+ if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0)
+ break;
+
+ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
+ update_iter = bch2_trans_copy_iter(trans, iter);
+ if ((ret = PTR_ERR_OR_ZERO(update_iter)))
+ goto err;
+
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+ bch2_cut_back(start, update);
+
+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ bch2_trans_update2(trans, update_iter, update);
+ bch2_trans_iter_put(trans, update_iter);
+ }
+
+ if (bkey_cmp(k.k->p, end) > 0) {
+ update_iter = bch2_trans_copy_iter(trans, iter);
+ if ((ret = PTR_ERR_OR_ZERO(update_iter)))
+ goto err;
+
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+ bch2_cut_front(end, update);
+
+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ bch2_trans_update2(trans, update_iter, update);
+ bch2_trans_iter_put(trans, update_iter);
+ } else {
+ update_iter = bch2_trans_copy_iter(trans, iter);
+ if ((ret = PTR_ERR_OR_ZERO(update_iter)))
+ goto err;
+
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ update->k = *k.k;
+ set_bkey_val_u64s(&update->k, 0);
+ update->k.type = KEY_TYPE_deleted;
+ update->k.size = 0;
+
+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ bch2_trans_update2(trans, update_iter, update);
+ bch2_trans_iter_put(trans, update_iter);
+ }
+
+ k = bch2_btree_iter_next_with_updates(iter);
+ }
+err:
+ if (!IS_ERR_OR_NULL(iter))
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
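Concretely, for an existing extent k and an insert range [start,end), the three branches above emit (sector numbers illustrative):

	existing k:   |<----------- [0,100) ----------->|
	new range:             |<-- [20,60) -->|
	emitted:      [0,20)                    [60,100)
	              (bch2_cut_back)           (bch2_cut_front)

An extent lying entirely within [start,end) takes the else branch instead and is replaced by a KEY_TYPE_deleted whiteout at its position.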
+
int __bch2_trans_commit(struct btree_trans *trans)
{
struct btree_insert_entry *i = NULL;
@@ -724,7 +844,36 @@ int __bch2_trans_commit(struct btree_trans *trans)
}
} while (trans_trigger_run);
+ /* Turn extents updates into keys: */
+ trans_for_each_update(trans, i)
+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
+ struct bpos start = bkey_start_pos(&i->k->k);
+
+ while (i + 1 < trans->updates + trans->nr_updates &&
+ i[0].iter->btree_id == i[1].iter->btree_id &&
+ !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)))
+ i++;
+
+ ret = extent_handle_overwrites(trans, i->iter->btree_id,
+ start, i->k->k.p);
+ if (ret)
+ goto out;
+ }
+
trans_for_each_update(trans, i) {
+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
+ ret = extent_update_to_keys(trans, i->iter, i->k);
+ if (ret)
+ goto out;
+ } else {
+ bch2_trans_update2(trans, i->iter, i->k);
+ }
+ }
+
+ trans_for_each_update2(trans, i) {
+ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
+ BUG_ON(i->iter->locks_want < 1);
+
u64s = jset_u64s(i->k->k.u64s);
if (0)
trans->journal_preres_u64s += u64s;
@@ -773,7 +922,10 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
.trigger_flags = flags, .iter = iter, .k = k
};
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k)));
+ EBUG_ON(bkey_cmp(iter->pos,
+ (iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_start_pos(&k->k)
+ : k->k.p));
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
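Note the position convention the relaxed assertion establishes: callers of bch2_trans_update() keep extent iterators at bkey_start_pos(&k->k) as before, while non-extent iterators (and the internal updates2 entries) sit at k->k.p. An illustrative caller, using only names from this patch:

	/* queue an extent update; the commit path splits/whiteouts
	 * whatever it overlaps */
	bch2_btree_iter_set_pos(iter, bkey_start_pos(&k->k));
	bch2_trans_update(&trans, iter, k, 0);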
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 7b0f0583b1a5..cd54c2b1eff2 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1254,21 +1254,21 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
struct bkey_s_c old,
struct bkey_i *new,
struct bch_fs_usage *fs_usage,
- unsigned flags)
+ unsigned flags,
+ bool is_extents)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter->l[0].b;
unsigned offset = 0;
- s64 sectors = 0;
+ s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
- if (btree_node_is_extents(b)
+ if (is_extents
? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
: bkey_cmp(new->k.p, old.k->p))
return 0;
- if (btree_node_is_extents(b)) {
+ if (is_extents) {
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
offset = 0;
@@ -1341,7 +1341,8 @@ int bch2_mark_update(struct btree_trans *trans,
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
ret = bch2_mark_overwrite(trans, iter, k, insert,
- fs_usage, flags);
+ fs_usage, flags,
+ btree_node_type_is_extents(iter->btree_id));
if (ret <= 0)
break;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 4c84787575f5..29ebc07a2497 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -268,7 +268,7 @@ int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *,
int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
struct bkey_s_c, struct bkey_i *,
- struct bch_fs_usage *, unsigned);
+ struct bch_fs_usage *, unsigned, bool);
int bch2_mark_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct bch_fs_usage *, unsigned);
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 846d77dc2530..fa6c0698f385 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -39,6 +39,12 @@ static int count_iters_for_insert(struct btree_trans *trans,
{
int ret = 0;
+ /*
+ * The extent update path requires an _additional_ iterator for each
+ * extent we're inserting and overwriting:
+ */
+ *nr_iters += 1;
+
switch (k.k->type) {
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
@@ -167,402 +173,40 @@ int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert,
- unsigned *u64s)
+ struct bkey_i *insert)
{
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *_k;
+ struct bkey_s_c k;
struct bkey unpacked;
int sectors;
- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
- KEY_TYPE_discard))) {
- struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked);
- enum bch_extent_overlap overlap =
- bch2_extent_overlap(&insert->k, k.k);
-
- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
- break;
-
- overlap = bch2_extent_overlap(&insert->k, k.k);
-
- /*
- * If we're overwriting an existing extent, we may need to emit
- * a whiteout - unless we're inserting a new extent at the same
- * position:
- */
- if (k.k->needs_whiteout &&
- (!bkey_whiteout(&insert->k) ||
- bkey_cmp(k.k->p, insert->k.p)))
- *u64s += BKEY_U64s;
-
- /*
- * If we're partially overwriting an existing extent which has
- * been written out to disk, we'll need to emit a new version of
- * that extent:
- */
- if (bkey_written(l->b, _k) &&
- overlap != BCH_EXTENT_OVERLAP_ALL)
- *u64s += _k->u64s;
-
- /* And we may be splitting an existing extent: */
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- *u64s += _k->u64s;
-
- if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
- (sectors = bch2_bkey_sectors_compressed(k))) {
- int flags = trans->flags & BTREE_INSERT_NOFAIL
- ? BCH_DISK_RESERVATION_NOFAIL : 0;
-
- switch (bch2_disk_reservation_add(trans->c,
- trans->disk_res,
- sectors, flags)) {
- case 0:
- break;
- case -ENOSPC:
- return BTREE_INSERT_ENOSPC;
- default:
- BUG();
- }
- }
-
- if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
- overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- break;
-
- bch2_btree_node_iter_advance(&node_iter, l->b);
- }
-
- return BTREE_INSERT_OK;
-}
-
-static void verify_extent_nonoverlapping(struct bch_fs *c,
- struct btree *b,
- struct btree_node_iter *_iter,
- struct bkey_i *insert)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct btree_node_iter iter;
- struct bkey_packed *k;
- struct bkey uk;
-
- if (!expensive_debug_checks(c))
- return;
-
- iter = *_iter;
- k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
- BUG_ON(k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
-
- iter = *_iter;
- k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
-#if 0
- BUG_ON(k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
-#else
- if (k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
- char buf1[100];
- char buf2[100];
-
- bch2_bkey_to_text(&PBUF(buf1), &insert->k);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
-
- bch2_dump_btree_node(b);
- panic("insert > next :\n"
- "insert %s\n"
- "next %s\n",
- buf1, buf2);
- }
-#endif
-
-#endif
-}
-
-static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_packed *k =
- bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
-
- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
-
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
- verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
-
- if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
-
- bch2_bset_insert(l->b, &l->iter, k, insert, 0);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
-}
-
-static void pack_push_whiteout(struct bch_fs *c, struct btree *b,
- struct bpos pos)
-{
- struct bkey_packed k;
-
- if (!bkey_pack_pos(&k, pos, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = pos;
- bkey_copy(&k, &tmp);
- }
-
- k.needs_whiteout = true;
- push_whiteout(c, b, &k);
-}
-
-static void
-extent_drop(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_packed *_k, struct bkey_s k)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- if (!bkey_whiteout(k.k))
- btree_account_key_drop(l->b, _k);
-
- k.k->size = 0;
- k.k->type = KEY_TYPE_deleted;
-
- if (!btree_node_old_extent_overwrite(l->b) &&
- k.k->needs_whiteout) {
- pack_push_whiteout(c, l->b, k.k->p);
- k.k->needs_whiteout = false;
- }
-
- if (_k >= btree_bset_last(l->b)->start) {
- unsigned u64s = _k->u64s;
-
- bch2_bset_delete(l->b, _k, _k->u64s);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0);
- } else {
- extent_save(l->b, _k, k.k);
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
- }
-}
-
-static void
-extent_squash(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert,
- struct bkey_packed *_k, struct bkey_s k,
- enum bch_extent_overlap overlap)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_on_stack tmp, split;
-
- bkey_on_stack_init(&tmp);
- bkey_on_stack_init(&split);
-
- if (!btree_node_old_extent_overwrite(l->b)) {
- if (!bkey_whiteout(&insert->k) &&
- !bkey_cmp(k.k->p, insert->k.p)) {
- insert->k.needs_whiteout = k.k->needs_whiteout;
- k.k->needs_whiteout = false;
- }
- } else {
- insert->k.needs_whiteout |= k.k->needs_whiteout;
- }
-
- switch (overlap) {
- case BCH_EXTENT_OVERLAP_FRONT:
- if (bkey_written(l->b, _k)) {
- bkey_on_stack_reassemble(&tmp, c, k.s_c);
- bch2_cut_front(insert->k.p, tmp.k);
-
- /*
- * needs_whiteout was propagated to new version of @k,
- * @tmp:
- */
- if (!btree_node_old_extent_overwrite(l->b))
- k.k->needs_whiteout = false;
-
- extent_drop(c, iter, _k, k);
- extent_bset_insert(c, iter, tmp.k);
- } else {
- btree_keys_account_val_delta(l->b, _k,
- bch2_cut_front_s(insert->k.p, k));
-
- extent_save(l->b, _k, k.k);
- /*
- * No need to call bset_fix_invalidated_key, start of
- * extent changed but extents are indexed by where they
- * end
- */
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
- }
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- if (bkey_written(l->b, _k)) {
- bkey_on_stack_reassemble(&tmp, c, k.s_c);
- bch2_cut_back(bkey_start_pos(&insert->k), tmp.k);
-
- /*
- * @tmp has different position than @k, needs_whiteout
- * should not be propagated:
- */
- if (!btree_node_old_extent_overwrite(l->b))
- tmp.k->k.needs_whiteout = false;
-
- extent_drop(c, iter, _k, k);
- extent_bset_insert(c, iter, tmp.k);
- } else {
- /*
- * position of @k is changing, emit a whiteout if
- * needs_whiteout is set:
- */
- if (!btree_node_old_extent_overwrite(l->b) &&
- k.k->needs_whiteout) {
- pack_push_whiteout(c, l->b, k.k->p);
- k.k->needs_whiteout = false;
- }
-
- btree_keys_account_val_delta(l->b, _k,
- bch2_cut_back_s(bkey_start_pos(&insert->k), k));
- extent_save(l->b, _k, k.k);
-
- bch2_bset_fix_invalidated_key(l->b, _k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
- }
- break;
- case BCH_EXTENT_OVERLAP_ALL:
- extent_drop(c, iter, _k, k);
- break;
- case BCH_EXTENT_OVERLAP_MIDDLE:
- bkey_on_stack_reassemble(&split, c, k.s_c);
- bch2_cut_back(bkey_start_pos(&insert->k), split.k);
-
- if (!btree_node_old_extent_overwrite(l->b))
- split.k->k.needs_whiteout = false;
-
- /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */
- if (bkey_written(l->b, _k)) {
- bkey_on_stack_reassemble(&tmp, c, k.s_c);
- bch2_cut_front(insert->k.p, tmp.k);
-
- if (!btree_node_old_extent_overwrite(l->b))
- k.k->needs_whiteout = false;
-
- extent_drop(c, iter, _k, k);
- extent_bset_insert(c, iter, tmp.k);
- } else {
- btree_keys_account_val_delta(l->b, _k,
- bch2_cut_front_s(insert->k.p, k));
-
- extent_save(l->b, _k, k.k);
- bch2_btree_iter_fix_key_modified(iter, l->b, _k);
- }
-
- extent_bset_insert(c, iter, split.k);
- break;
- }
-
- bkey_on_stack_exit(&split, c);
- bkey_on_stack_exit(&tmp, c);
-}
+ _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
+ KEY_TYPE_discard);
+ if (!_k)
+ return BTREE_INSERT_OK;
-/**
- * bch_extent_insert_fixup - insert a new extent and deal with overlaps
- *
- * this may result in not actually doing the insert, or inserting some subset
- * of the insert key. For cmpxchg operations this is where that logic lives.
- *
- * All subsets of @insert that need to be inserted are inserted using
- * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
- * returns false, setting @iter->pos for the prefix of @insert that actually got
- * inserted.
- *
- * BSET INVARIANTS: this function is responsible for maintaining all the
- * invariants for bsets of extents in memory. things get really hairy with 0
- * size extents
- *
- * within one bset:
- *
- * bkey_start_pos(bkey_next(k)) >= k
- * or bkey_start_offset(bkey_next(k)) >= k->offset
- *
- * i.e. strict ordering, no overlapping extents.
- *
- * multiple bsets (i.e. full btree node):
- *
- * ∀ k, j
- * k.size != 0 ∧ j.size != 0 →
- * ¬ (k > bkey_start_pos(j) ∧ k < j)
- *
- * i.e. no two overlapping keys _of nonzero size_
- *
- * We can't realistically maintain this invariant for zero size keys because of
- * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
- * there may be another 0 size key between them in another bset, and it will
- * thus overlap with the merged key.
- *
- * In addition, the end of iter->pos indicates how much has been processed.
- * If the end of iter->pos is not the same as the end of insert, then
- * key insertion needs to continue/be retried.
- */
-void bch2_insert_fixup_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter_level *l = &iter->l[0];
- struct btree_node_iter node_iter = l->iter;
- bool do_update = !bkey_whiteout(&insert->k);
- struct bkey_packed *_k;
- struct bkey unpacked;
+ k = bkey_disassemble(l->b, _k, &unpacked);
- EBUG_ON(iter->level);
- EBUG_ON(!insert->k.size);
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+ /* Check if we're splitting a compressed extent: */
- while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
- KEY_TYPE_discard))) {
- struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
- enum bch_extent_overlap overlap =
- bch2_extent_overlap(&insert->k, k.k);
+ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 &&
+ bkey_cmp(insert->k.p, k.k->p) < 0 &&
+ (sectors = bch2_bkey_sectors_compressed(k))) {
+ int flags = trans->flags & BTREE_INSERT_NOFAIL
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
- if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+ switch (bch2_disk_reservation_add(trans->c, trans->disk_res,
+ sectors, flags)) {
+ case 0:
break;
-
- if (!bkey_whiteout(k.k))
- do_update = true;
-
- if (!do_update) {
- struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
-
- bch2_cut_front(cur_end, insert);
- bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
- } else {
- extent_squash(c, iter, insert, _k, k, overlap);
+ case -ENOSPC:
+ return BTREE_INSERT_ENOSPC;
+ default:
+ BUG();
}
-
- node_iter = l->iter;
-
- if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
- overlap == BCH_EXTENT_OVERLAP_MIDDLE)
- break;
}
- l->iter = node_iter;
- bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
-
- if (do_update) {
- if (insert->k.type == KEY_TYPE_deleted)
- insert->k.type = KEY_TYPE_discard;
-
- if (!bkey_whiteout(&insert->k) ||
- btree_node_old_extent_overwrite(l->b))
- extent_bset_insert(c, iter, insert);
-
- bch2_btree_journal_key(trans, iter, insert);
- }
-
- bch2_cut_front(insert->k.p, insert);
+ return BTREE_INSERT_OK;
}
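With overwrite handling moved to commit time, all that remains here is the disk reservation for splitting a compressed extent. As a rough illustration (sector numbers made up): if k is a compressed extent [0,100) occupying 30 sectors on disk and the insert covers [40,60), the result is two extents [0,40) and [60,100) that each still reference the whole 30-sector compressed payload, so 30 extra sectors are reserved up front; under BTREE_INSERT_NOFAIL the reservation is taken with BCH_DISK_RESERVATION_NOFAIL and may dip into reserves.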
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
index e9dc8091ba3f..38dc084627d2 100644
--- a/fs/bcachefs/extent_update.h
+++ b/fs/bcachefs/extent_update.h
@@ -11,9 +11,6 @@ int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, unsigned *);
-void bch2_insert_fixup_extent(struct btree_trans *,
- struct btree_iter *,
- struct bkey_i *);
+ struct bkey_i *);
#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index eca723121a2c..902c8da9dc15 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -422,6 +422,42 @@ static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size)
POS(inode_nr + 1, 0), NULL);
}
+static int bch2_fix_overlapping_extent(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k, struct bpos cut_at)
+{
+ struct btree_iter *u_iter;
+ struct bkey_i *u;
+ int ret;
+
+ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(u, k);
+ bch2_cut_front(cut_at, u);
+
+ u_iter = bch2_trans_copy_iter(trans, iter);
+ ret = PTR_ERR_OR_ZERO(u_iter);
+ if (ret)
+ return ret;
+
+ /*
+ * We don't want to go through the
+ * extent_handle_overwrites path:
+ */
+ __bch2_btree_iter_set_pos(u_iter, u->k.p, false);
+
+ /*
+ * XXX: this is going to leave disk space
+ * accounting slightly wrong
+ */
+ ret = bch2_trans_update(trans, u_iter, u, 0);
+ bch2_trans_iter_put(trans, u_iter);
+ return ret;
+}
+
/*
* Walk extents: verify that extents have a corresponding S_ISREG inode, and
* that i_size an i_sectors are consistent
@@ -433,6 +469,7 @@ static int check_extents(struct bch_fs *c)
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
+ struct bkey prev = KEY(0, 0, 0);
u64 i_sectors;
int ret = 0;
@@ -444,6 +481,25 @@ static int check_extents(struct bch_fs *c)
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
for_each_btree_key_continue(iter, 0, k, ret) {
+ if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) {
+ char buf1[100];
+ char buf2[100];
+
+ bch2_bkey_to_text(&PBUF(buf1), &prev);
+ bch2_bkey_to_text(&PBUF(buf2), k.k);
+
+ if (fsck_err(c, "overlapping extents: %s, %s", buf1, buf2)) {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_fix_overlapping_extent(&trans,
+ iter, k, prev.p));
+ if (ret)
+ goto err;
+ }
+ }
+ prev = *k.k;
+
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index bd0edda7abf9..27378cc9cdd5 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -161,13 +161,16 @@ static void journal_entries_free(struct list_head *list)
}
}
+/*
+ * When keys compare equal, oldest compares first:
+ */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
const struct journal_key *l = _l;
const struct journal_key *r = _r;
return cmp_int(l->btree_id, r->btree_id) ?:
- bkey_cmp(l->pos, r->pos) ?:
+ bkey_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
@@ -179,25 +182,11 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
return cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
- bkey_cmp(l->pos, r->pos);
-}
-
-static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i)
-{
- while (i + 1 < keys->d + keys->nr &&
- journal_sort_key_cmp(i, i + 1) > 0) {
- swap(i[0], i[1]);
- i++;
- }
+ bkey_cmp(l->k->k.p, r->k->k.p);
}
static void journal_keys_free(struct journal_keys *keys)
{
- struct journal_key *i;
-
- for_each_journal_key(*keys, i)
- if (i->allocated)
- kfree(i->k);
kvfree(keys->d);
keys->d = NULL;
keys->nr = 0;
@@ -208,15 +197,15 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
struct journal_replay *p;
struct jset_entry *entry;
struct bkey_i *k, *_n;
- struct journal_keys keys = { NULL }, keys_deduped = { NULL };
- struct journal_key *i;
+ struct journal_keys keys = { NULL };
+ struct journal_key *src, *dst;
size_t nr_keys = 0;
list_for_each_entry(p, journal_entries, list)
for_each_jset_key(k, _n, entry, &p->j)
nr_keys++;
- keys.journal_seq_base = keys_deduped.journal_seq_base =
+ keys.journal_seq_base =
le64_to_cpu(list_first_entry(journal_entries,
struct journal_replay,
list)->j.seq);
@@ -225,96 +214,31 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
if (!keys.d)
goto err;
- keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL);
- if (!keys_deduped.d)
- goto err;
-
list_for_each_entry(p, journal_entries, list)
- for_each_jset_key(k, _n, entry, &p->j) {
- if (bkey_deleted(&k->k) &&
- btree_node_type_is_extents(entry->btree_id))
- continue;
-
+ for_each_jset_key(k, _n, entry, &p->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
- .pos = bkey_start_pos(&k->k),
.k = k,
.journal_seq = le64_to_cpu(p->j.seq) -
keys.journal_seq_base,
.journal_offset = k->_data - p->j._data,
};
- }
sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
- i = keys.d;
- while (i < keys.d + keys.nr) {
- if (i + 1 < keys.d + keys.nr &&
- i[0].btree_id == i[1].btree_id &&
- !bkey_cmp(i[0].pos, i[1].pos)) {
- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
- i++;
- } else {
- bch2_cut_front(i[1].k->k.p, i[0].k);
- i[0].pos = i[1].k->k.p;
- journal_keys_sift(&keys, i);
- }
- continue;
- }
-
- if (i + 1 < keys.d + keys.nr &&
- i[0].btree_id == i[1].btree_id &&
- bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) {
- if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
- cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
- bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k);
- } else {
- struct bkey_i *split =
- kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);
-
- if (!split)
- goto err;
-
- bkey_copy(split, i[0].k);
- bch2_cut_back(bkey_start_pos(&i[1].k->k), split);
- keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
- .btree_id = i[0].btree_id,
- .allocated = true,
- .pos = bkey_start_pos(&split->k),
- .k = split,
- .journal_seq = i[0].journal_seq,
- .journal_offset = i[0].journal_offset,
- };
-
- bch2_cut_front(i[1].k->k.p, i[0].k);
- i[0].pos = i[1].k->k.p;
- journal_keys_sift(&keys, i);
- continue;
- }
- } else {
- if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) {
- i[1] = i[0];
- i++;
- continue;
- } else {
- bch2_cut_front(i[0].k->k.p, i[1].k);
- i[1].pos = i[0].k->k.p;
- journal_keys_sift(&keys, i + 1);
- continue;
- }
- }
- }
+ src = dst = keys.d;
+ while (src < keys.d + keys.nr) {
+ while (src + 1 < keys.d + keys.nr &&
+ src[0].btree_id == src[1].btree_id &&
+ !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
+ src++;
- keys_deduped.d[keys_deduped.nr++] = *i++;
+ *dst++ = *src++;
}
- kvfree(keys.d);
- return keys_deduped;
+ keys.nr = dst - keys.d;
err:
- journal_keys_free(&keys_deduped);
- kvfree(keys.d);
- return (struct journal_keys) { NULL };
+ return keys;
}
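journal_sort_key_cmp() now orders ties oldest first, so the dedup loop keeps the last - i.e. newest - key for each (btree_id, position) pair; extents need no special casing anymore because, as whole keys, they compare by their end position like everything else. For example:

	/* sorted input:  (EXTENTS, pos 5:100, seq 10)
	 *                (EXTENTS, pos 5:100, seq 12)  <- kept
	 *                (EXTENTS, pos 5:200, seq 11)  <- kept
	 * (illustrative sequence numbers) */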
/* journal replay: */
@@ -365,11 +289,6 @@ retry:
atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
- split_iter = bch2_trans_copy_iter(&trans, iter);
- ret = PTR_ERR_OR_ZERO(split_iter);
- if (ret)
- goto err;
-
split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
ret = PTR_ERR_OR_ZERO(split);
if (ret)
@@ -388,12 +307,25 @@ retry:
}
bkey_copy(split, k);
- bch2_cut_front(split_iter->pos, split);
+ bch2_cut_front(iter->pos, split);
bch2_cut_back(atomic_end, split);
+ split_iter = bch2_trans_copy_iter(&trans, iter);
+ ret = PTR_ERR_OR_ZERO(split_iter);
+ if (ret)
+ goto err;
+
+ /*
+ * It's important that we don't go through the
+ * extent_handle_overwrites() and extent_update_to_keys() path
+ * here: journal replay is supposed to treat extents like
+ * regular keys
+ */
+ __bch2_btree_iter_set_pos(split_iter, split->k.p, false);
bch2_trans_update(&trans, split_iter, split, !remark
? BTREE_TRIGGER_NORUN
: BTREE_TRIGGER_NOOVERWRITES);
+
bch2_btree_iter_set_pos(iter, split->k.p);
} while (bkey_cmp(iter->pos, k->k.p) < 0);
@@ -424,11 +356,18 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
struct btree_iter *iter;
int ret;
- iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
+ iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
+ /*
+ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
+ * extent_handle_overwrites() and extent_update_to_keys() - but we don't
+ * want that here, journal replay is supposed to treat extents like
+ * regular keys:
+ */
+ __bch2_btree_iter_set_pos(iter, k->k.p, false);
+
ret = bch2_btree_iter_traverse(iter) ?:
bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
bch2_trans_iter_put(trans, iter);
@@ -459,7 +398,7 @@ static int bch2_journal_replay(struct bch_fs *c,
if (i->btree_id == BTREE_ID_ALLOC)
ret = bch2_alloc_replay_key(c, i->k);
- else if (btree_node_type_is_extents(i->btree_id))
+ else if (i->k->k.size)
ret = bch2_extent_replay_key(c, i->btree_id, i->k);
else
ret = bch2_journal_replay_key(c, i->btree_id, i->k);
@@ -859,6 +798,15 @@ int bch2_fs_recovery(struct bch_fs *c)
journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
+ if (!c->sb.clean &&
+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+
ret = journal_replay_early(c, clean, &journal_entries);
if (ret)
goto err;
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index ccd84a8fe60d..c91309301563 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -5,8 +5,6 @@
struct journal_keys {
struct journal_key {
enum btree_id btree_id:8;
- unsigned allocated:1;
- struct bpos pos;
struct bkey_i *k;
u32 journal_seq;
u32 journal_offset;