From d4e3b928ab487a8aecd1f6a140b40ac365116cfb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Nov 2023 19:13:27 -0500 Subject: closures: CLOSURE_CALLBACK() to fix type punning Control flow integrity is now checking that type signatures match on indirect function calls. That breaks closures, which embed a work_struct in a closure in such a way that a closure_fn may also be used as a workqueue fn by the underlying closure code. So we have to change closure fns to take a work_struct as their argument - but that results in a loss of clarity, as closure fns have different semantics from normal workqueue functions (they run owning a ref on the closure, which must be released with continue_at() or closure_return()). Thus, this patc introduces CLOSURE_CALLBACK() and closure_type() macros as suggested by Kees, to smooth things over a bit. Suggested-by: Kees Cook Cc: Coly Li Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 7 +++---- fs/bcachefs/btree_update_interior.c | 4 ++-- fs/bcachefs/fs-io-direct.c | 8 ++++---- fs/bcachefs/io_write.c | 14 +++++++------- fs/bcachefs/io_write.h | 3 +-- fs/bcachefs/journal_io.c | 17 ++++++++--------- fs/bcachefs/journal_io.h | 2 +- 7 files changed, 26 insertions(+), 29 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 37d896edb06e..57c20390e10e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1358,10 +1358,9 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void * return offset; } -static void btree_node_read_all_replicas_done(struct closure *cl) +static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) { - struct btree_node_read_all *ra = - container_of(cl, struct btree_node_read_all, cl); + closure_type(ra, struct btree_node_read_all, cl); struct bch_fs *c = ra->c; struct btree *b = ra->b; struct printbuf buf = PRINTBUF; @@ -1567,7 +1566,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool if (sync) { closure_sync(&ra->cl); - btree_node_read_all_replicas_done(&ra->cl); + btree_node_read_all_replicas_done(&ra->cl.work); } else { continue_at(&ra->cl, btree_node_read_all_replicas_done, c->io_complete_wq); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 76f27bc9fa24..d08efd6d958e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -778,9 +778,9 @@ static void btree_interior_update_work(struct work_struct *work) } } -static void btree_update_set_nodes_written(struct closure *cl) +static CLOSURE_CALLBACK(btree_update_set_nodes_written) { - struct btree_update *as = container_of(cl, struct btree_update, cl); + closure_type(as, struct btree_update, cl); struct bch_fs *c = as->c; mutex_lock(&c->btree_interior_update_lock); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 5b42a76c4796..9a479e4de6b3 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -35,9 +35,9 @@ static void bio_check_or_release(struct bio *bio, bool check_dirty) } } -static void bch2_dio_read_complete(struct closure *cl) +static CLOSURE_CALLBACK(bch2_dio_read_complete) { - struct dio_read *dio = container_of(cl, struct dio_read, cl); + closure_type(dio, struct dio_read, cl); dio->req->ki_complete(dio->req, dio->ret); bio_check_or_release(&dio->rbio.bio, dio->should_dirty); @@ -325,9 +325,9 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) return 0; } -static void bch2_dio_write_flush_done(struct closure *cl) +static CLOSURE_CALLBACK(bch2_dio_write_flush_done) { - struct dio_write *dio = container_of(cl, struct dio_write, op.cl); + closure_type(dio, struct dio_write, op.cl); struct bch_fs *c = dio->op.c; closure_debug_destroy(cl); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index d704a8f829c8..8ede46b1e354 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -580,9 +580,9 @@ static inline void wp_update_state(struct write_point *wp, bool running) __wp_update_state(wp, state); } -static void bch2_write_index(struct closure *cl) +static CLOSURE_CALLBACK(bch2_write_index) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + closure_type(op, struct bch_write_op, cl); struct write_point *wp = op->wp; struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; @@ -1208,9 +1208,9 @@ static void __bch2_nocow_write_done(struct bch_write_op *op) bch2_nocow_write_convert_unwritten(op); } -static void bch2_nocow_write_done(struct closure *cl) +static CLOSURE_CALLBACK(bch2_nocow_write_done) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + closure_type(op, struct bch_write_op, cl); __bch2_nocow_write_done(op); bch2_write_done(cl); @@ -1363,7 +1363,7 @@ err: op->insert_keys.top = op->insert_keys.keys; } else if (op->flags & BCH_WRITE_SYNC) { closure_sync(&op->cl); - bch2_nocow_write_done(&op->cl); + bch2_nocow_write_done(&op->cl.work); } else { /* * XXX @@ -1566,9 +1566,9 @@ err: * If op->discard is true, instead of inserting the data it invalidates the * region of the cache represented by op->bio and op->inode. */ -void bch2_write(struct closure *cl) +CLOSURE_CALLBACK(bch2_write) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + closure_type(op, struct bch_write_op, cl); struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; unsigned data_len; diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 9323167229ee..6c276a48f95d 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -90,8 +90,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->devs_need_flush = NULL; } -void bch2_write(struct closure *); - +CLOSURE_CALLBACK(bch2_write); void bch2_write_point_do_index_updates(struct work_struct *); static inline struct bch_write_bio *wbio_init(struct bio *bio) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 786a09285509..02e6484f9953 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1025,10 +1025,9 @@ next_block: return 0; } -static void bch2_journal_read_device(struct closure *cl) +static CLOSURE_CALLBACK(bch2_journal_read_device) { - struct journal_device *ja = - container_of(cl, struct journal_device, read); + closure_type(ja, struct journal_device, read); struct bch_dev *ca = container_of(ja, struct bch_dev, journal); struct bch_fs *c = ca->fs; struct journal_list *jlist = @@ -1520,9 +1519,9 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); } -static void journal_write_done(struct closure *cl) +static CLOSURE_CALLBACK(journal_write_done) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; @@ -1638,9 +1637,9 @@ static void journal_write_endio(struct bio *bio) percpu_ref_put(&ca->io_ref); } -static void do_journal_write(struct closure *cl) +static CLOSURE_CALLBACK(do_journal_write) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); @@ -1850,9 +1849,9 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * return 0; } -void bch2_journal_write(struct closure *cl) +CLOSURE_CALLBACK(bch2_journal_write) { - struct journal *j = container_of(cl, struct journal, io); + closure_type(j, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index a88d097b13f1..c035e7c108e1 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -60,6 +60,6 @@ void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); -void bch2_journal_write(struct closure *); +CLOSURE_CALLBACK(bch2_journal_write); #endif /* _BCACHEFS_JOURNAL_IO_H */ -- cgit v1.2.3 From 6201d91ee32cf92e9bcca69a3cf73461827b5ce5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Nov 2023 17:56:14 -0500 Subject: bcachefs: Put erasure coding behind an EXPERIMENTAL kconfig option We still have disk space accounting changes coming for erasure coding, and the changes won't be as strictly backwards compatible as they'd ought to be - specifically, we need to start accounting striped data under a separate counter in bch_alloc (which describes buckets). A fsck will suffice for upgrading/downgrading, but since erasure coding is the most incomplete major feature of bcachefs it still makes sense to put behind a separate kconfig option, so that users are fully aware. Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 12 ++++++++++++ fs/bcachefs/alloc_foreground.c | 3 +++ 2 files changed, 15 insertions(+) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index c08c2c7d6fbb..fddc7be58022 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -33,6 +33,18 @@ config BCACHEFS_QUOTA depends on BCACHEFS_FS select QUOTACTL +config BCACHEFS_ERASURE_CODING + bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)" + depends on BCACHEFS_FS + select QUOTACTL + help + This enables the "erasure_code" filesysystem and inode option, which + organizes data into reed-solomon stripes instead of ordinary + replication. + + WARNING: this feature is still undergoing on disk format changes, and + should only be enabled for testing purposes. + config BCACHEFS_POSIX_ACL bool "bcachefs POSIX ACL support" depends on BCACHEFS_FS diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index b85c7765272f..5042858719c0 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1321,6 +1321,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, int ret; int i; + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) + erasure_code = false; + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); BUG_ON(!nr_replicas || !nr_replicas_required); -- cgit v1.2.3 From 50e029c6390a6795869b742a5fce1e57d6a76c82 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Nov 2023 17:24:32 -0500 Subject: bcachefs: bch2_moving_ctxt_flush_all() Introduce a new helper to flush all move IOs, and use it in a few places where we should have been. The new helper also drops btree locks before waiting on outstanding move writes, avoiding potential deadlocks. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ab749bf2fcbc..7819ed8d9df9 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -163,12 +163,18 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->write_sectors) != sectors_pending); } +static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) +{ + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + bch2_trans_unlock_long(ctxt->trans); + closure_sync(&ctxt->cl); +} + void bch2_moving_ctxt_exit(struct moving_context *ctxt) { struct bch_fs *c = ctxt->trans->c; - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); - closure_sync(&ctxt->cl); + bch2_moving_ctxt_flush_all(ctxt); EBUG_ON(atomic_read(&ctxt->write_sectors)); EBUG_ON(atomic_read(&ctxt->write_ios)); @@ -484,8 +490,8 @@ int bch2_move_ratelimit(struct moving_context *ctxt) struct bch_fs *c = ctxt->trans->c; u64 delay; - if (ctxt->wait_on_copygc && !c->copygc_running) { - bch2_trans_unlock_long(ctxt->trans); + if (ctxt->wait_on_copygc && c->copygc_running) { + bch2_moving_ctxt_flush_all(ctxt); wait_event_killable(c->copygc_running_wq, !c->copygc_running || kthread_should_stop()); @@ -512,7 +518,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt) schedule_timeout(delay); if (unlikely(freezing(current))) { - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + bch2_moving_ctxt_flush_all(ctxt); try_to_freeze(); } } while (delay); -- cgit v1.2.3 From 261af2f1c6b668586325a37df02f904c1c273a7d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Nov 2023 23:44:47 -0500 Subject: bcachefs: Make sure bch2_move_ratelimit() also waits for move_ops This adds move_ctxt_wait_event_timeout(), which can sleep for a timeout while also issueing pending moves as reads complete. Co-developed-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 17 ++++------------- fs/bcachefs/move.h | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 13 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 7819ed8d9df9..71f865352816 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -500,22 +500,13 @@ int bch2_move_ratelimit(struct moving_context *ctxt) do { delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; - - if (delay) { - if (delay > HZ / 10) - bch2_trans_unlock_long(ctxt->trans); - else - bch2_trans_unlock(ctxt->trans); - set_current_state(TASK_INTERRUPTIBLE); - } - - if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { - __set_current_state(TASK_RUNNING); + if ((current->flags & PF_KTHREAD) && kthread_should_stop()) return 1; - } if (delay) - schedule_timeout(delay); + move_ctxt_wait_event_timeout(ctxt, + freezing(current) || kthread_should_stop(), + delay); if (unlikely(freezing(current))) { bch2_moving_ctxt_flush_all(ctxt); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 07cf9d42643b..0906aa2d1de2 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -38,6 +38,25 @@ struct moving_context { wait_queue_head_t wait; }; +#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \ +({ \ + int _ret = 0; \ + while (true) { \ + bool cond_finished = false; \ + bch2_moving_ctxt_do_pending_writes(_ctxt); \ + \ + if (_cond) \ + break; \ + bch2_trans_unlock_long((_ctxt)->trans); \ + _ret = __wait_event_timeout((_ctxt)->wait, \ + bch2_moving_ctxt_next_pending_write(_ctxt) || \ + (cond_finished = (_cond)), _timeout); \ + if (_ret || ( cond_finished)) \ + break; \ + } \ + _ret; \ +}) + #define move_ctxt_wait_event(_ctxt, _cond) \ do { \ bool cond_finished = false; \ -- cgit v1.2.3 From 202a7c292ec65d5a602aa356f4fa4f50ff84cdee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Nov 2023 00:54:43 -0500 Subject: bcachefs: Don't stop copygc thread on device resize copygc no longer has to scan the buckets, so it's no longer a problem if the number of buckets is changing while it's running. This also fixes a bug where we forgot to restart copygc. Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 58d8c6ffd955..61e4cc09ad83 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -2091,8 +2091,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; - bch2_copygc_stop(c); - if (resize) { down_write(&c->gc_lock); down_write(&ca->bucket_lock); -- cgit v1.2.3 From 468035ca4b83fd49be99a67b5aac3960d577c7c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Nov 2023 00:55:59 -0500 Subject: bcachefs: Start gc, copygc, rebalance threads after initing writes ref This fixes a bug where copygc would occasionally race with going read-write and die, thinking we were read only, because it couldn't take a ref on c->writes. It's not necessary for copygc (or rebalance, or copygc) to take write refs; they could run with BCH_TRANS_COMMIT_nocheck_rw, but this is an easier fix that making sure that flag is passed correctly everywhere. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 24672bb31cbe..14f4eff421cf 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -423,6 +423,18 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + set_bit(BCH_FS_RW, &c->flags); + set_bit(BCH_FS_WAS_RW, &c->flags); + +#ifndef BCH_WRITE_REF_DEBUG + percpu_ref_reinit(&c->writes); +#else + for (i = 0; i < BCH_WRITE_REF_NR; i++) { + BUG_ON(atomic_long_read(&c->writes[i])); + atomic_long_inc(&c->writes[i]); + } +#endif + ret = bch2_gc_thread_start(c); if (ret) { bch_err(c, "error starting gc thread"); @@ -439,24 +451,16 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) goto err; } -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_reinit(&c->writes); -#else - for (i = 0; i < BCH_WRITE_REF_NR; i++) { - BUG_ON(atomic_long_read(&c->writes[i])); - atomic_long_inc(&c->writes[i]); - } -#endif - set_bit(BCH_FS_RW, &c->flags); - set_bit(BCH_FS_WAS_RW, &c->flags); - bch2_do_discards(c); bch2_do_invalidates(c); bch2_do_stripe_deletes(c); bch2_do_pending_node_rewrites(c); return 0; err: - __bch2_fs_read_only(c); + if (test_bit(BCH_FS_RW, &c->flags)) + bch2_fs_read_only(c); + else + __bch2_fs_read_only(c); return ret; } -- cgit v1.2.3 From 0a11adfb7aceb65831ff1c4d129d611e54fc3f57 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Nov 2023 16:31:20 -0500 Subject: bcachefs: Fix an endianness conversion cpu_to_le32(), not le32_to_cpu() - fixes a sparse complaint. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index e9af77b384c7..5dac038f0851 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -959,7 +959,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) parent_id, id)) goto err; - parent->v.children[i] = le32_to_cpu(child_id); + parent->v.children[i] = cpu_to_le32(child_id); normalize_snapshot_child_pointers(&parent->v); } -- cgit v1.2.3 From 63807d951803e422cea8bfb4fdd36f57de191ada Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 16 Nov 2023 12:13:43 -0500 Subject: bcachefs: preserve device path as device name Various userspace scripts/tools may expect mount entries in /proc/mounts to reflect the device path names used to mount the associated filesystem. bcachefs seems to normalize the device path to the underlying device name based on the block device. This confuses tools like fstests when the test devices might be lvm or device-mapper based. The default behavior for show_vfsmnt() appers to be to use the string passed to alloc_vfsmnt(), so tweak bcachefs to copy the path at device superblock read time and to display it via ->show_devname(). Signed-off-by: Brian Foster Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 3 +-- fs/bcachefs/super-io.c | 5 +++++ fs/bcachefs/super_types.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 8ef817304e4a..4d51be813509 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1667,8 +1667,7 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) if (!first) seq_putc(seq, ':'); first = false; - seq_puts(seq, "/dev/"); - seq_puts(seq, ca->name); + seq_puts(seq, ca->disk_sb.sb_name); } return 0; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f4cad903f4d6..f3e12f7979d5 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -166,6 +166,7 @@ void bch2_free_super(struct bch_sb_handle *sb) if (!IS_ERR_OR_NULL(sb->bdev)) blkdev_put(sb->bdev, sb->holder); kfree(sb->holder); + kfree(sb->sb_name); kfree(sb->sb); memset(sb, 0, sizeof(*sb)); @@ -675,6 +676,10 @@ retry: if (!sb->holder) return -ENOMEM; + sb->sb_name = kstrdup(path, GFP_KERNEL); + if (!sb->sb_name) + return -ENOMEM; + #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) sb->mode |= BLK_OPEN_BUFFERED; diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 7dda4985b99f..9c1fd4ca2b10 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -5,6 +5,7 @@ struct bch_sb_handle { struct bch_sb *sb; struct block_device *bdev; + char *sb_name; struct bio *bio; void *holder; size_t buffer_size; -- cgit v1.2.3 From 8a443d3ea1327fea5ac3be77d2e39ebe35bfe9cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Nov 2023 23:13:49 -0500 Subject: bcachefs: Proper refcounting for journal_keys The btree iterator code overlays keys from the journal until journal replay is finished; since we're now starting copygc/rebalance etc. before replay is finished, this is multithreaded access and thus needs refcounting. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/btree_iter.c | 6 +++++- fs/bcachefs/btree_journal_iter.c | 18 +++++++++++++++--- fs/bcachefs/btree_journal_iter.h | 10 +++++++++- fs/bcachefs/recovery.c | 11 +++++++---- fs/bcachefs/super.c | 6 ++++-- 6 files changed, 42 insertions(+), 11 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 403aa3389fcc..e9d753b04a90 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -638,6 +638,8 @@ struct journal_keys { size_t gap; size_t nr; size_t size; + atomic_t ref; + bool initial_ref_held; }; struct btree_trans_buf { diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 6fa90bcd7016..8e0fe65f6101 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2981,7 +2981,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans->fn_idx = fn_idx; trans->locking_wait.task = current; trans->journal_replay_not_finished = - !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + atomic_inc_not_zero(&c->journal_keys.ref); closure_init_stack(&trans->ref); s = btree_trans_stats(trans); @@ -3098,6 +3099,9 @@ void bch2_trans_put(struct btree_trans *trans) kfree(trans->fs_usage_deltas); } + if (unlikely(trans->journal_replay_not_finished)) + bch2_journal_keys_put(c); + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, &c->btree_trans_mem_pool); else diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 58a981bcf3aa..ec52f50d249d 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -80,6 +80,8 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree struct journal_keys *keys = &c->journal_keys; unsigned iters = 0; struct journal_key *k; + + BUG_ON(*idx > keys->nr); search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); @@ -189,10 +191,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, /* Since @keys was full, there was no gap: */ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); kvfree(keys->d); - *keys = new_keys; + keys->d = new_keys.d; + keys->nr = new_keys.nr; + keys->size = new_keys.size; /* And now the gap is at the end: */ - keys->gap = keys->nr; + keys->gap = keys->nr; } journal_iters_move_gap(c, keys->gap, idx); @@ -415,10 +419,16 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) cmp_int(l->journal_offset, r->journal_offset); } -void bch2_journal_keys_free(struct journal_keys *keys) +void bch2_journal_keys_put(struct bch_fs *c) { + struct journal_keys *keys = &c->journal_keys; struct journal_key *i; + BUG_ON(atomic_read(&keys->ref) <= 0); + + if (!atomic_dec_and_test(&keys->ref)) + return; + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); keys->gap = keys->nr; @@ -429,6 +439,8 @@ void bch2_journal_keys_free(struct journal_keys *keys) kvfree(keys->d); keys->d = NULL; keys->nr = keys->gap = keys->size = 0; + + bch2_journal_entries_free(c); } static void __journal_keys_sort(struct journal_keys *keys) diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 5d64e7e22f26..8ca4c100b2e3 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -49,7 +49,15 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct bch_fs *, struct btree *); -void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_keys_put(struct bch_fs *); + +static inline void bch2_journal_keys_put_initial(struct bch_fs *c) +{ + if (c->journal_keys.initial_ref_held) + bch2_journal_keys_put(c); + c->journal_keys.initial_ref_held = false; +} + void bch2_journal_entries_free(struct bch_fs *); int bch2_journal_keys_sort(struct bch_fs *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 9c30500ce920..770ced1c6285 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -167,6 +167,8 @@ static int bch2_journal_replay(struct bch_fs *c) goto err; } + BUG_ON(!atomic_read(&keys->ref)); + for (i = 0; i < keys->nr; i++) { k = keys_sorted[i]; @@ -188,6 +190,9 @@ static int bch2_journal_replay(struct bch_fs *c) } } + if (!c->opts.keep_journal) + bch2_journal_keys_put_initial(c); + replay_now_at(j, j->replay_journal_seq_end); j->replay_journal_seq = 0; @@ -909,10 +914,8 @@ out: bch2_flush_fsck_errs(c); if (!c->opts.keep_journal && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) { - bch2_journal_keys_free(&c->journal_keys); - bch2_journal_entries_free(c); - } + test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + bch2_journal_keys_put_initial(c); kfree(clean); if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 14f4eff421cf..f63474c5c5a2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -508,8 +508,8 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); - bch2_journal_keys_free(&c->journal_keys); - bch2_journal_entries_free(c); + bch2_journal_keys_put_initial(c); + BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); free_percpu(c->online_reserved); @@ -706,6 +706,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); + atomic_set(&c->journal_keys.ref, 1); + c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); -- cgit v1.2.3 From 0af8a06a4ce823e380385cdd9538cdd968a1ffae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 20 Nov 2023 18:23:26 -0500 Subject: bcachefs: deallocate_extra_replicas() When allocating from devices with different durability, we might end up with more replicas than required; this changes bch2_alloc_sectors_start() to check for this, and drop replicas that aren't needed to hit the number of replicas requested. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 5042858719c0..1ba0eeb7552a 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1297,6 +1297,30 @@ out: return wp; } +static noinline void +deallocate_extra_replicas(struct bch_fs *c, + struct open_buckets *ptrs, + struct open_buckets *ptrs_no_use, + unsigned extra_replicas) +{ + struct open_buckets ptrs2 = { 0 }; + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, ptrs, ob, i) { + unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + + if (d && d <= extra_replicas) { + extra_replicas -= d; + ob_push(c, ptrs_no_use, ob); + } else { + ob_push(c, &ptrs2, ob); + } + } + + *ptrs = ptrs2; +} + /* * Get us an open_bucket we can allocate from, return with it locked: */ @@ -1385,6 +1409,9 @@ alloc_done: if (ret) goto err; + if (nr_effective > nr_replicas) + deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); + /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) open_bucket_free_unused(c, ob); -- cgit v1.2.3 From 7d9f8468ff7589073981b3eb8b175945c7dcd13c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Nov 2023 21:51:45 -0500 Subject: bcachefs: Data update path won't accidentaly grow replicas Previously, there was a bug where if an extent had greater durability than required (because we needed to move a durability=1 pointer and ended up putting it on a durability 2 device), we would submit a write for replicas=2 - the durability of the pointer being rewritten - instead of the number of replicas required to bring it back up to the data_replicas option. This, plus the allocation path sometimes allocating on a greater durability device than requested, meant that extents could continue having more and more replicas added as they were being rewritten. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 92 +++++++++++++++++++++++++++++++++++++++++------ fs/bcachefs/data_update.h | 9 +++-- fs/bcachefs/errcode.h | 2 +- fs/bcachefs/io_read.c | 2 +- fs/bcachefs/move.c | 58 +++--------------------------- 5 files changed, 96 insertions(+), 67 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 5ed66202c226..71aa5e59787b 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -356,7 +356,7 @@ void bch2_data_update_exit(struct data_update *update) bch2_bio_free_pages_pool(c, &update->op.wbio.bio); } -void bch2_update_unwritten_extent(struct btree_trans *trans, +static void bch2_update_unwritten_extent(struct btree_trans *trans, struct data_update *update) { struct bch_fs *c = update->op.c; @@ -436,7 +436,51 @@ void bch2_update_unwritten_extent(struct btree_trans *trans, } } +int bch2_extent_drop_ptrs(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct data_update_opts data_opts) +{ + struct bch_fs *c = trans->c; + struct bkey_i *n; + int ret; + + n = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + while (data_opts.kill_ptrs) { + unsigned i = 0, drop = __fls(data_opts.kill_ptrs); + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); + data_opts.kill_ptrs ^= 1U << drop; + } + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, bkey_i_to_s(n)); + + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&n->k)) + n->k.size = 0; + + return bch2_trans_relock(trans) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); +} + int bch2_data_update_init(struct btree_trans *trans, + struct btree_iter *iter, struct moving_context *ctxt, struct data_update *m, struct write_point_specifier wp, @@ -452,7 +496,7 @@ int bch2_data_update_init(struct btree_trans *trans, const struct bch_extent_ptr *ptr; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; unsigned ptrs_locked = 0; - int ret; + int ret = 0; bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); @@ -478,6 +522,8 @@ int bch2_data_update_init(struct btree_trans *trans, bkey_for_each_ptr(ptrs, ptr) percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); + unsigned durability_have = 0, durability_removing = 0; + i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { bool locked; @@ -489,8 +535,11 @@ int bch2_data_update_init(struct btree_trans *trans, reserve_sectors += k.k->size; m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - } else if (!p.ptr.cached) { + durability_removing += bch2_extent_ptr_desired_durability(c, &p); + } else if (!p.ptr.cached && + !((1U << i) & m->data_opts.kill_ptrs)) { bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + durability_have += bch2_extent_ptr_durability(c, &p); } /* @@ -529,6 +578,29 @@ int bch2_data_update_init(struct btree_trans *trans, i++; } + /* + * If current extent durability is less than io_opts.data_replicas, + * we're not trying to rereplicate the extent up to data_replicas here - + * unless extra_replicas was specified + * + * Increasing replication is an explicit operation triggered by + * rereplicate, currently, so that users don't get an unexpected -ENOSPC + */ + if (durability_have >= io_opts.data_replicas) { + m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; + m->data_opts.rewrite_ptrs = 0; + /* if iter == NULL, it's just a promote */ + if (iter) + ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts); + goto done; + } + + m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) + + m->data_opts.extra_replicas; + m->op.nr_replicas_required = m->op.nr_replicas; + + BUG_ON(!m->op.nr_replicas); + if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas @@ -538,14 +610,11 @@ int bch2_data_update_init(struct btree_trans *trans, goto err; } - m->op.nr_replicas += m->data_opts.extra_replicas; - m->op.nr_replicas_required = m->op.nr_replicas; - - BUG_ON(!m->op.nr_replicas); + if (bkey_extent_is_unwritten(k)) { + bch2_update_unwritten_extent(trans, m); + goto done; + } - /* Special handling required: */ - if (bkey_extent_is_unwritten(k)) - return -BCH_ERR_unwritten_extent_update; return 0; err: i = 0; @@ -560,6 +629,9 @@ err: bch2_bkey_buf_exit(&m->k, c); bch2_bio_free_pages_pool(c, &m->op.wbio.bio); return ret; +done: + bch2_data_update_exit(m); + return ret ?: -BCH_ERR_data_update_done; } void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 9dc17b9d8379..991095bbd469 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -32,9 +32,14 @@ int bch2_data_update_index_update(struct bch_write_op *); void bch2_data_update_read_done(struct data_update *, struct bch_extent_crc_unpacked); +int bch2_extent_drop_ptrs(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c, + struct data_update_opts); + void bch2_data_update_exit(struct data_update *); -void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *); -int bch2_data_update_init(struct btree_trans *, struct moving_context *, +int bch2_data_update_init(struct btree_trans *, struct btree_iter *, + struct moving_context *, struct data_update *, struct write_point_specifier, struct bch_io_opts, struct data_update_opts, diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 68a1a96bb7ca..69b0627c2180 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -162,7 +162,7 @@ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ x(0, restart_recovery) \ - x(0, unwritten_extent_update) \ + x(0, data_update_done) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index a56ed553dc15..36763865facd 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -209,7 +209,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - ret = bch2_data_update_init(trans, NULL, &op->write, + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), opts, (struct data_update_opts) { diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 71f865352816..3b0a501b6baf 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -229,49 +229,6 @@ void bch2_move_stats_init(struct bch_move_stats *stats, char *name) scnprintf(stats->name, sizeof(stats->name), "%s", name); } -static int bch2_extent_drop_ptrs(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct data_update_opts data_opts) -{ - struct bch_fs *c = trans->c; - struct bkey_i *n; - int ret; - - n = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - while (data_opts.kill_ptrs) { - unsigned i = 0, drop = __fls(data_opts.kill_ptrs); - struct bch_extent_ptr *ptr; - - bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); - data_opts.kill_ptrs ^= 1U << drop; - } - - /* - * If the new extent no longer has any pointers, bch2_extent_normalize() - * will do the appropriate thing with it (turning it into a - * KEY_TYPE_error key, or just a discard if it was a cached extent) - */ - bch2_extent_normalize(c, bkey_i_to_s(n)); - - /* - * Since we're not inserting through an extent iterator - * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), - * we aren't using the extent overwrite path to delete, we're - * just using the normal key deletion path: - */ - if (bkey_deleted(&n->k)) - n->k.size = 0; - - return bch2_trans_relock(trans) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); -} - int bch2_move_extent(struct moving_context *ctxt, struct move_bucket_in_flight *bucket_in_flight, struct btree_iter *iter, @@ -341,19 +298,11 @@ int bch2_move_extent(struct moving_context *ctxt, io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_end_io = move_read_endio; - ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, io_opts, data_opts, iter->btree_id, k); - if (ret && ret != -BCH_ERR_unwritten_extent_update) + if (ret) goto err_free_pages; - if (ret == -BCH_ERR_unwritten_extent_update) { - bch2_update_unwritten_extent(trans, &io->write); - move_free(io); - return 0; - } - - BUG_ON(ret); - io->write.op.end_io = move_write_done; if (ctxt->rate) @@ -397,6 +346,9 @@ err_free_pages: err_free: kfree(io); err: + if (ret == -BCH_ERR_data_update_done) + return 0; + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]); trace_move_extent_alloc_mem_fail2(c, k); return ret; -- cgit v1.2.3 From e4f72bb46a774b449ffe864fa6fffa7ecbf8f3f7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Nov 2023 21:52:06 -0500 Subject: bcachefs: Fix ec + durability calculation Durability of an erasure coded pointer doesn't add the device durability; durability is the same for any extent in that stripe so the calculation only comes from the stripe. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a864de231b69..f6c92df55270 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -649,37 +649,31 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return replicas; } -unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p) { - struct bch_dev *ca; - if (p->ptr.cached) return 0; - ca = bch_dev_bkey_exists(c, p->ptr.dev); - - return ca->mi.durability + - (p->has_ec - ? p->ec.redundancy - : 0); + return p->has_ec + ? p->ec.redundancy + 1 + : ca->mi.durability; } -unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - struct bch_dev *ca; + struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); - if (p->ptr.cached) - return 0; + return __extent_ptr_durability(ca, p); +} - ca = bch_dev_bkey_exists(c, p->ptr.dev); +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); if (ca->mi.state == BCH_MEMBER_STATE_failed) return 0; - return ca->mi.durability + - (p->has_ec - ? p->ec.redundancy - : 0); + return __extent_ptr_durability(ca, p); } unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -- cgit v1.2.3 From 3f3ae1250e739fb446639efad0ba916ba0a012f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Nov 2023 12:16:22 -0500 Subject: bcachefs: bpos is misaligned on big endian bkey embeds a bpos that is misaligned on big endian; this is so that bch2_bkey_swab() works correctly without having to differentiate between packed and non-packed keys (a debatable design decision). This means it can't have the __aligned() tag on big endian. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0a750953ff92..ca0842f9b7e9 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -151,7 +151,11 @@ struct bpos { #else #error edit for your odd byteorder. #endif -} __packed __aligned(4); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +__aligned(4) +#endif +; #define KEY_INODE_MAX ((__u64)~0ULL) #define KEY_OFFSET_MAX ((__u64)~0ULL) -- cgit v1.2.3 From bbc3a46065d08f9ab3412b1f26bbfa778c444833 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Nov 2023 23:12:45 -0500 Subject: bcachefs: Fix zstd compress workspace size zstd apparently lies about the size of the compression workspace it requires; if we double it compression succeeds. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/compress.c | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e9d753b04a90..dfa22f9d9a1d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -931,7 +931,7 @@ struct bch_fs { mempool_t compression_bounce[2]; mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; mempool_t decompress_workspace; - ZSTD_parameters zstd_params; + size_t zstd_workspace_size; struct crypto_shash *sha256; struct crypto_sync_skcipher *chacha20; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index a8b148ec2a2b..51af8ea230ed 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -354,8 +354,7 @@ static int attempt_compress(struct bch_fs *c, */ unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); - ZSTD_CCtx *ctx = zstd_init_cctx(workspace, - zstd_cctx_workspace_bound(¶ms.cParams)); + ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size); /* * ZSTD requires that when we decompress we pass in the exact @@ -371,7 +370,7 @@ static int attempt_compress(struct bch_fs *c, size_t len = zstd_compress_cctx(ctx, dst + 4, dst_len - 4 - 7, src, src_len, - &c->zstd_params); + ¶ms); if (zstd_is_error(len)) return 0; @@ -572,6 +571,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) size_t decompress_workspace_size = 0; ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); + + /* + * ZSTD is lying: if we allocate the size of the workspace it says it + * requires, it returns memory allocation errors + */ + c->zstd_workspace_size = zstd_cctx_workspace_bound(¶ms.cParams); + struct { unsigned feature; enum bch_compression_type type; @@ -585,13 +591,11 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, - zstd_cctx_workspace_bound(¶ms.cParams), + c->zstd_workspace_size, zstd_dctx_workspace_bound() }, }, *i; bool have_compressed = false; - c->zstd_params = params; - for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); i++) -- cgit v1.2.3 From d5bd37872a93e07ef3f9cbd4e2044ba4e17b5021 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Nov 2023 21:42:08 -0500 Subject: bcachefs: Add missing validation for jset_entry_data_usage Validation was completely missing for replicas entries in the journal (not the superblock replicas section) - we can't have replicas entries pointing to invalid devices. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/journal_io.c | 12 ++++++++- fs/bcachefs/replicas.c | 69 +++++++++++++++++++++++++++--------------------- fs/bcachefs/replicas.h | 2 ++ 4 files changed, 53 insertions(+), 31 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 69b0627c2180..ae7910bf2228 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -210,6 +210,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_members) \ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ + x(BCH_ERR_invalid_sb, invalid_replicas_entry) \ x(BCH_ERR_invalid_sb, invalid_sb_journal) \ x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 02e6484f9953..0f17fc5f8d68 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -547,6 +547,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + struct printbuf err = PRINTBUF; int ret = 0; if (journal_entry_err_on(bytes < sizeof(*u) || @@ -555,10 +556,19 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, journal_entry_data_usage_bad_size, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); - return ret; + goto out; } + if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), + c, version, jset, entry, + journal_entry_data_usage_bad_size, + "invalid journal entry usage: %s", err.buf)) { + journal_entry_null_range(entry, vstruct_next(entry)); + goto out; + } +out: fsck_err: + printbuf_exit(&err); return ret; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 1c3ae13bfced..2008fe8bf706 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -68,6 +68,33 @@ void bch2_replicas_entry_to_text(struct printbuf *out, prt_printf(out, "]"); } +int bch2_replicas_entry_validate(struct bch_replicas_entry *r, + struct bch_sb *sb, + struct printbuf *err) +{ + if (!r->nr_devs) { + prt_printf(err, "no devices in entry "); + goto bad; + } + + if (r->nr_required > 1 && + r->nr_required >= r->nr_devs) { + prt_printf(err, "bad nr_required in entry "); + goto bad; + } + + for (unsigned i = 0; i < r->nr_devs; i++) + if (!bch2_dev_exists(sb, r->devs[i])) { + prt_printf(err, "invalid device %u in entry ", r->devs[i]); + goto bad; + } + + return 0; +bad: + bch2_replicas_entry_to_text(err, r); + return -BCH_ERR_invalid_replicas_entry; +} + void bch2_cpu_replicas_to_text(struct printbuf *out, struct bch_replicas_cpu *r) { @@ -163,7 +190,8 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, } static struct bch_replicas_cpu -cpu_replicas_add_entry(struct bch_replicas_cpu *old, +cpu_replicas_add_entry(struct bch_fs *c, + struct bch_replicas_cpu *old, struct bch_replicas_entry *new_entry) { unsigned i; @@ -173,6 +201,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, replicas_entry_bytes(new_entry)), }; + for (i = 0; i < new_entry->nr_devs; i++) + BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i])); + BUG_ON(!new_entry->data_type); verify_replicas_entry(new_entry); @@ -382,7 +413,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, if (c->replicas_gc.entries && !__replicas_has_entry(&c->replicas_gc, new_entry)) { - new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); + new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); if (!new_gc.entries) { ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; @@ -390,7 +421,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, } if (!__replicas_has_entry(&c->replicas, new_entry)) { - new_r = cpu_replicas_add_entry(&c->replicas, new_entry); + new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); if (!new_r.entries) { ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; @@ -598,7 +629,7 @@ int bch2_replicas_set_usage(struct bch_fs *c, if (idx < 0) { struct bch_replicas_cpu n; - n = cpu_replicas_add_entry(&c->replicas, r); + n = cpu_replicas_add_entry(c, &c->replicas, r); if (!n.entries) return -BCH_ERR_ENOMEM_cpu_replicas; @@ -797,7 +828,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_sb *sb, struct printbuf *err) { - unsigned i, j; + unsigned i; sort_cmp_size(cpu_r->entries, cpu_r->nr, @@ -808,31 +839,9 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_replicas_entry *e = cpu_replicas_entry(cpu_r, i); - if (e->data_type >= BCH_DATA_NR) { - prt_printf(err, "invalid data type in entry "); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } - - if (!e->nr_devs) { - prt_printf(err, "no devices in entry "); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } - - if (e->nr_required > 1 && - e->nr_required >= e->nr_devs) { - prt_printf(err, "bad nr_required in entry "); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } - - for (j = 0; j < e->nr_devs; j++) - if (!bch2_dev_exists(sb, e->devs[j])) { - prt_printf(err, "invalid device %u in entry ", e->devs[j]); - bch2_replicas_entry_to_text(err, e); - return -BCH_ERR_invalid_sb_replicas; - } + int ret = bch2_replicas_entry_validate(e, sb, err); + if (ret) + return ret; if (i + 1 < cpu_r->nr) { struct bch_replicas_entry *n = diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h index 4887675a86f0..f70a642775d1 100644 --- a/fs/bcachefs/replicas.h +++ b/fs/bcachefs/replicas.h @@ -9,6 +9,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry *); +int bch2_replicas_entry_validate(struct bch_replicas_entry *, + struct bch_sb *, struct printbuf *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); static inline struct bch_replicas_entry * -- cgit v1.2.3 From 03013bb0c6b157630018800a456a08646655ea97 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Nov 2023 23:55:26 -0500 Subject: bcachefs: Fix bucket data type for stripe buckets Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 61e4cc09ad83..5a91d3189fcf 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -854,8 +854,12 @@ static int __mark_pointer(struct btree_trans *trans, return ret; *dst_sectors += sectors; - *bucket_data_type = *dirty_sectors || *cached_sectors - ? ptr_data_type : 0; + + if (!*dirty_sectors && !*cached_sectors) + *bucket_data_type = 0; + else if (*bucket_data_type != BCH_DATA_stripe) + *bucket_data_type = ptr_data_type; + return 0; } -- cgit v1.2.3 From 5510a4af521c34df339fcceb0f26ace5b1090ad6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Nov 2023 18:31:11 -0500 Subject: bcachefs: Fix split_race livelock bch2_btree_update_start() calculates which nodes are going to have to be split/rewritten, so that we know how many nodes to reserve and how deep in the tree we have to take locks. But btree node merges require inserting two keys into the parent node, not just splits. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d08efd6d958e..d68fb01944b1 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1071,8 +1071,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, break; } + /* + * Always check for space for two keys, even if we won't have to + * split at prior level - it might have been a merge instead: + */ if (bch2_btree_node_insert_fits(c, path->l[update_level].b, - BKEY_BTREE_PTR_U64s_MAX * (1 + split))) + BKEY_BTREE_PTR_U64s_MAX * 2)) break; split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); -- cgit v1.2.3 From ae4d612cc1f23a6137d0978b1620a2af2e201c1a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Nov 2023 21:13:54 -0500 Subject: bcachefs: trace_move_extent_start_fail() now includes errcode Renamed from trace_move_extent_alloc_mem_fail, because there are other reasons we colud fail (disk space allocation failure). Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 +- fs/bcachefs/move.c | 23 ++++++++++------------- fs/bcachefs/trace.h | 6 +++--- 3 files changed, 14 insertions(+), 17 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ca0842f9b7e9..1ab1f08d763b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1532,7 +1532,7 @@ struct bch_sb_field_disk_groups { x(move_extent_write, 36) \ x(move_extent_finish, 37) \ x(move_extent_fail, 38) \ - x(move_extent_alloc_mem_fail, 39) \ + x(move_extent_start_fail, 39) \ x(copygc, 40) \ x(copygc_wait, 41) \ x(gc_gens_end, 42) \ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 3b0a501b6baf..0a04adff1144 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -49,17 +49,6 @@ static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) } } -static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k) -{ - if (trace_move_extent_alloc_mem_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_alloc_mem_fail(c, buf.buf); - printbuf_exit(&buf); - } -} - struct moving_io { struct list_head read_list; struct list_head io_list; @@ -349,8 +338,16 @@ err: if (ret == -BCH_ERR_data_update_done) return 0; - this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]); - trace_move_extent_alloc_mem_fail2(c, k); + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]); + if (trace_move_extent_start_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, ": "); + prt_str(&buf, bch2_err_str(ret)); + trace_move_extent_start_fail(c, buf.buf); + printbuf_exit(&buf); + } return ret; } diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 7857671159b4..fd49b63562c3 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -754,9 +754,9 @@ TRACE_EVENT(move_extent_fail, TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) ); -DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(bkey, move_extent_start_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); TRACE_EVENT(move_data, -- cgit v1.2.3 From 1b1bd0fd41577400434d5948edc2679092c57b08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 26 Nov 2023 23:11:18 -0500 Subject: bcachefs: -EROFS doesn't count as move_extent_start_fail The automated tests check if we've hit too many slowpath/error path events and fail the test - if we're just shutting down, that naturally shouldn't count. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0a04adff1144..67ac68f9dd3f 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -338,6 +338,10 @@ err: if (ret == -BCH_ERR_data_update_done) return 0; + if (bch2_err_matches(ret, EROFS) || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]); if (trace_move_extent_start_fail_enabled()) { struct printbuf buf = PRINTBUF; -- cgit v1.2.3 From ef0beeb8dd343a57cf8ad4967b508b8e7452f347 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Nov 2023 00:53:46 -0500 Subject: bcachefs: move journal seq assertion journal_cur_seq() can legitimately be used outside of the journal lock, where this assert can race Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 ++ fs/bcachefs/journal.h | 4 +--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 23a9b7845d11..489b34046e78 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -321,6 +321,8 @@ static int journal_entry_open(struct journal *j) atomic64_inc(&j->seq); journal_pin_list_init(fifo_push_ref(&j->pin), 1); + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); bkey_extent_init(&buf->key); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index c85d01cf4948..4c513fca5ef2 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -136,9 +136,7 @@ static inline u64 journal_last_seq(struct journal *j) static inline u64 journal_cur_seq(struct journal *j) { - EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); - - return j->pin.back - 1; + return atomic64_read(&j->seq); } static inline u64 journal_last_unwritten_seq(struct journal *j) -- cgit v1.2.3 From 2111f39459b0b2ec54859b756c625edb0befce0e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Nov 2023 19:26:23 -0500 Subject: bcachefs: Fix race between btree writes and metadata drop btree writes update the btree node key after every write, in order to update sectors_written, and they also might need to drop pointers if one of the writes failed in a replicated btree node. But the btree node might also have had a pointer dropped while the write was in flight, by bch2_dev_metadata_drop(), and thus there was a bug where the btree node write would ovewrite the btree node's key with what it had at the start of the write. Fix this by dropping pointers not currently in the btree node key. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d68fb01944b1..6697417273aa 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2270,6 +2270,10 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, BUG_ON(!btree_node_hashed(b)); + struct bch_extent_ptr *ptr; + bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, + !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); + ret = bch2_btree_node_update_key(trans, &iter, b, new_key, commit_flags, skip_triggers); out: -- cgit v1.2.3 From 463086d99889c84ce0960fac2815c596f26cc0e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Nov 2023 16:31:48 -0500 Subject: bcachefs: Convert gc_alloc_start() to for_each_btree_key2() This eliminates some SRCU warnings: for_each_btree_key2() runs every loop iteration in a distinct transaction context. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0b5d09c8475d..30ab78a24517 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1541,8 +1541,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) rcu_assign_pointer(ca->buckets_gc, buckets); } - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ ca = bch_dev_bkey_exists(c, k.k->p.inode); g = gc_bucket(ca, k.k->p.offset); @@ -1561,8 +1561,9 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) g->stripe = a->stripe; g->stripe_redundancy = a->stripe_redundancy; } - } - bch2_trans_iter_exit(trans, &iter); + + 0; + })); err: bch2_trans_put(trans); if (ret) -- cgit v1.2.3 From 415e5107b0dce0e5407ae4a46700cd7e8859e252 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 Nov 2023 16:33:52 -0500 Subject: bcachefs: Extra kthread_should_stop() calls for copygc This fixes a bug where going read-only was taking longer than it should have due to copygc forgetting to check kthread_should_stop() Additionally: fix a missing is_kthread check in bch2_move_ratelimit(). Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 12 +++++++++--- fs/bcachefs/movinggc.c | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'fs/bcachefs') diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 67ac68f9dd3f..54830ee0ed88 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -441,24 +441,26 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, int bch2_move_ratelimit(struct moving_context *ctxt) { struct bch_fs *c = ctxt->trans->c; + bool is_kthread = current->flags & PF_KTHREAD; u64 delay; if (ctxt->wait_on_copygc && c->copygc_running) { bch2_moving_ctxt_flush_all(ctxt); wait_event_killable(c->copygc_running_wq, !c->copygc_running || - kthread_should_stop()); + (is_kthread && kthread_should_stop())); } do { delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; - if ((current->flags & PF_KTHREAD) && kthread_should_stop()) + if (is_kthread && kthread_should_stop()) return 1; if (delay) move_ctxt_wait_event_timeout(ctxt, - freezing(current) || kthread_should_stop(), + freezing(current) || + (is_kthread && kthread_should_stop()), delay); if (unlikely(freezing(current))) { @@ -633,6 +635,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; + bool is_kthread = current->flags & PF_KTHREAD; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_iter iter; struct bkey_buf sk; @@ -678,6 +681,9 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, } while (!(ret = bch2_move_ratelimit(ctxt))) { + if (is_kthread && kthread_should_stop()) + break; + bch2_trans_begin(trans); ret = bch2_get_next_backpointer(trans, bucket, gen, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 0a0576326c5b..a84e79f79e5e 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -207,7 +207,7 @@ static int bch2_copygc(struct moving_context *ctxt, goto err; darray_for_each(buckets, i) { - if (unlikely(freezing(current))) + if (kthread_should_stop() || freezing(current)) break; f = move_bucket_in_flight_add(buckets_in_flight, *i); -- cgit v1.2.3