author		Kent Overstreet <kent.overstreet@gmail.com>	2021-04-18 03:37:04 +0300
committer	Kent Overstreet <kent.overstreet@linux.dev>	2023-10-23 00:09:01 +0300
commit		89baec780f8b218f5a8bce777b13b6116e416ff6 (patch)
tree		08793bfe9d327b6807c208eeac450741b790f2da
parent		fa272f33bbfc68856efa7aa0f2e33d9fe5982e17 (diff)
bcachefs: Allocator refactoring
This uses the kthread_wait_freezable() macro to simplify a lot of the
allocator thread code, along with cleaning up bch2_invalidate_one_bucket2().

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--	fs/bcachefs/alloc_background.c	382
-rw-r--r--	fs/bcachefs/alloc_foreground.c	47
-rw-r--r--	fs/bcachefs/trace.h		43
3 files changed, 161 insertions(+), 311 deletions(-)
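The bulk of the simplification comes from kthread_wait_freezable(): the removed
wait_buckets_available() and the old push_invalidated_bucket() loop each
open-coded a set_current_state()/kthread_should_stop()/schedule()/try_to_freeze()
wait, and the patch replaces those with calls to the macro. Its real definition
lives elsewhere in the bcachefs tree (it is not part of this diff); the sketch
below is only an approximation of the pattern it wraps, to make the before/after
easier to follow.

/*
 * Rough sketch of the pattern kthread_wait_freezable() encapsulates
 * (illustrative only; the real macro in the bcachefs utility headers
 * may differ in detail). Evaluates to nonzero if the kthread was asked
 * to stop, 0 once the condition becomes true.
 */
#define kthread_wait_freezable(_cond)					\
({									\
	int _ret = 0;							\
									\
	while (1) {							\
		set_current_state(TASK_INTERRUPTIBLE);			\
		if (kthread_should_stop()) {				\
			_ret = 1;					\
			break;						\
		}							\
									\
		if (_cond)						\
			break;						\
									\
		schedule();						\
		try_to_freeze();					\
	}								\
	__set_current_state(TASK_RUNNING);				\
	_ret;								\
})

With the waiting and freezer handling factored out this way, every blocking
point in the new bch2_allocator_thread() below collapses to a single line such
as ret = kthread_wait_freezable(buckets_available(ca, gc_count)), with the
condition helpers also responsible for publishing the allocator state via
alloc_thread_set_state().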
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index ab60bf259b0c..2d532fe4d30b 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -441,50 +441,6 @@ out:
* commands to the newly free buckets, then puts them on the various freelists.
*/
-/**
- * wait_buckets_available - wait on reclaimable buckets
- *
- * If there aren't enough available buckets to fill up free_inc, wait until
- * there are.
- */
-static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
-{
- unsigned long gc_count = c->gc_count;
- s64 available;
- int ret = 0;
-
- ca->allocator_state = ALLOCATOR_blocked;
- closure_wake_up(&c->freelist_wait);
-
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop()) {
- ret = 1;
- break;
- }
-
- if (gc_count != c->gc_count)
- ca->inc_gen_really_needs_gc = 0;
-
- available = dev_buckets_reclaimable(ca);
- available -= ca->inc_gen_really_needs_gc;
-
- available = max(available, 0LL);
-
- if (available)
- break;
-
- schedule();
- try_to_freeze();
- }
-
- __set_current_state(TASK_RUNNING);
- ca->allocator_state = ALLOCATOR_running;
- closure_wake_up(&c->freelist_wait);
-
- return ret;
-}
-
static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
struct bucket_mark m)
{
@@ -502,11 +458,8 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
gc_gen = bucket_gc_gen(bucket(ca, b));
- if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
- ca->inc_gen_needs_gc++;
-
- if (gc_gen >= BUCKET_GC_GEN_MAX)
- ca->inc_gen_really_needs_gc++;
+ ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
+ ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX;
return gc_gen < BUCKET_GC_GEN_MAX;
}
@@ -583,6 +536,8 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
struct bucket_mark m = READ_ONCE(g->mark);
unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
+ cond_resched();
+
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
@@ -599,8 +554,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
.key = key,
};
}
-
- cond_resched();
}
if (e.nr)
@@ -693,6 +646,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
size_t i, nr = 0;
ca->inc_gen_needs_gc = 0;
+ ca->inc_gen_really_needs_gc = 0;
switch (ca->mi.replacement) {
case BCH_CACHE_REPLACEMENT_lru:
@@ -714,25 +668,6 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
return nr;
}
-static inline long next_alloc_bucket(struct bch_dev *ca)
-{
- struct alloc_heap_entry e, *top = ca->alloc_heap.data;
-
- while (ca->alloc_heap.used) {
- if (top->nr) {
- size_t b = top->bucket;
-
- top->bucket++;
- top->nr--;
- return b;
- }
-
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
- }
-
- return -1;
-}
-
/*
* returns sequence number of most recent journal entry that updated this
* bucket:
@@ -755,17 +690,56 @@ static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
}
}
-static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
- struct bch_dev *ca,
- struct btree_iter *iter,
- u64 *journal_seq, unsigned flags)
+static int bucket_invalidate_btree(struct btree_trans *trans,
+ struct bch_dev *ca, u64 b)
{
struct bch_fs *c = trans->c;
- struct bkey_alloc_buf a;
+ struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
- bool invalidating_cached_data;
+ struct btree_iter *iter =
+ bch2_trans_get_iter(trans, BTREE_ID_alloc,
+ POS(ca->dev_idx, b),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL|
+ BTREE_ITER_INTENT);
+ int ret;
+
+ a = bch2_trans_kmalloc(trans, sizeof(*a));
+ ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ goto err;
+
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto err;
+
+ percpu_down_read(&c->mark_lock);
+ g = bucket(ca, b);
+ m = READ_ONCE(g->mark);
+ u = alloc_mem_to_key(iter, g, m);
+ percpu_up_read(&c->mark_lock);
+
+ u.gen++;
+ u.data_type = 0;
+ u.dirty_sectors = 0;
+ u.cached_sectors = 0;
+ u.read_time = atomic64_read(&c->io_clock[READ].now);
+ u.write_time = atomic64_read(&c->io_clock[WRITE].now);
+
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_BUCKET_INVALIDATE);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
+static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
+ u64 *journal_seq, unsigned flags)
+{
+ struct bucket *g;
+ struct bucket_mark m;
size_t b;
int ret = 0;
@@ -811,48 +785,12 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
goto out;
}
- bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
-retry:
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- percpu_down_read(&c->mark_lock);
- g = bucket(ca, iter->pos.offset);
- m = READ_ONCE(g->mark);
- u = alloc_mem_to_key(iter, g, m);
-
- percpu_up_read(&c->mark_lock);
-
- invalidating_cached_data = u.cached_sectors != 0;
-
- u.gen++;
- u.data_type = 0;
- u.dirty_sectors = 0;
- u.cached_sectors = 0;
- u.read_time = atomic64_read(&c->io_clock[READ].now);
- u.write_time = atomic64_read(&c->io_clock[WRITE].now);
-
- bch2_alloc_pack(c, &a, u);
- bch2_trans_update(trans, iter, &a.k,
- BTREE_TRIGGER_BUCKET_INVALIDATE);
-
- /*
- * XXX:
- * when using deferred btree updates, we have journal reclaim doing
- * btree updates and thus requiring the allocator to make forward
- * progress, and here the allocator is requiring space in the journal -
- * so we need a journal pre-reservation:
- */
- ret = bch2_trans_commit(trans, NULL,
- invalidating_cached_data ? journal_seq : NULL,
- BTREE_INSERT_NOUNLOCK|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RESERVED|
- flags);
- if (ret == -EINTR)
- goto retry;
+ ret = bch2_trans_do(c, NULL, journal_seq,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RESERVED|
+ flags,
+ bucket_invalidate_btree(&trans, ca, b));
out:
if (!ret) {
/* remove from alloc_heap: */
@@ -894,28 +832,23 @@ out:
*/
static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
{
- struct btree_trans trans;
- struct btree_iter *iter;
u64 journal_seq = 0;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc,
- POS(ca->dev_idx, 0),
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
-
/* Only use nowait if we've already invalidated at least one bucket: */
while (!ret &&
!fifo_full(&ca->free_inc) &&
- ca->alloc_heap.used)
- ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq,
+ ca->alloc_heap.used) {
+ ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
(!fifo_empty(&ca->free_inc)
? BTREE_INSERT_NOWAIT : 0));
-
- bch2_trans_iter_put(&trans, iter);
- bch2_trans_exit(&trans);
+ /*
+ * We only want to batch up invalidates when they're going to
+ * require flushing the journal:
+ */
+ if (!journal_seq)
+ break;
+ }
/* If we used NOWAIT, don't return the error: */
if (!fifo_empty(&ca->free_inc))
@@ -935,83 +868,72 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
return 0;
}
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
+static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
+{
+ if (ca->allocator_state != new_state) {
+ ca->allocator_state = new_state;
+ closure_wake_up(&ca->fs->freelist_wait);
+ }
+}
+
+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
unsigned i;
int ret = 0;
- while (1) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
-
- /*
- * Don't strand buckets on the copygc freelist until
- * after recovery is finished:
- */
- if (!test_bit(BCH_FS_STARTED, &c->flags) &&
- i == RESERVE_MOVINGGC)
- continue;
-
- if (fifo_push(&ca->free[i], bucket)) {
- fifo_pop(&ca->free_inc, bucket);
-
- closure_wake_up(&c->freelist_wait);
- ca->allocator_state = ALLOCATOR_running;
-
- spin_unlock(&c->freelist_lock);
- goto out;
- }
- }
-
- if (ca->allocator_state != ALLOCATOR_blocked_full) {
- ca->allocator_state = ALLOCATOR_blocked_full;
- closure_wake_up(&c->freelist_wait);
- }
-
- spin_unlock(&c->freelist_lock);
+ spin_lock(&c->freelist_lock);
+ for (i = 0; i < RESERVE_NR; i++) {
+ /*
+ * Don't strand buckets on the copygc freelist until
+ * after recovery is finished:
+ */
+ if (i == RESERVE_MOVINGGC &&
+ !test_bit(BCH_FS_STARTED, &c->flags))
+ continue;
- if ((current->flags & PF_KTHREAD) &&
- kthread_should_stop()) {
+ if (fifo_push(&ca->free[i], b)) {
+ fifo_pop(&ca->free_inc, b);
ret = 1;
break;
}
-
- schedule();
- try_to_freeze();
}
-out:
- __set_current_state(TASK_RUNNING);
+ spin_unlock(&c->freelist_lock);
+
+ ca->allocator_state = ret
+ ? ALLOCATOR_running
+ : ALLOCATOR_blocked_full;
+ closure_wake_up(&c->freelist_wait);
return ret;
}
-/*
- * Pulls buckets off free_inc, discards them (if enabled), then adds them to
- * freelists, waiting until there's room if necessary:
- */
-static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
+static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
- while (!fifo_empty(&ca->free_inc)) {
- size_t bucket = fifo_peek(&ca->free_inc);
-
- if (ca->mi.discard &&
- bdev_max_discard_sectors(ca->disk_sb.bdev))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca, bucket),
- ca->mi.bucket_size, GFP_NOIO);
-
- if (push_invalidated_bucket(c, ca, bucket))
- return 1;
- }
+ if (ca->mi.discard &&
+ bdev_max_discard_sectors(ca->disk_sb.bdev))
+ blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
+ ca->mi.bucket_size, GFP_NOFS);
+}
- return 0;
+static bool allocator_thread_running(struct bch_dev *ca)
+{
+ unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
+ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
+ ? ALLOCATOR_running
+ : ALLOCATOR_stopped;
+ alloc_thread_set_state(ca, state);
+ return state == ALLOCATOR_running;
}
-static inline bool allocator_thread_running(struct bch_dev *ca)
+static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
{
- return ca->mi.state == BCH_MEMBER_STATE_rw &&
- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags);
+ s64 available = dev_buckets_reclaimable(ca) -
+ (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
+ bool ret = available > 0;
+
+ alloc_thread_set_state(ca, ret
+ ? ALLOCATOR_running
+ : ALLOCATOR_blocked);
+ return ret;
}
/**
@@ -1026,56 +948,29 @@ static int bch2_allocator_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
+ unsigned long gc_count = c->gc_count;
size_t nr;
int ret;
set_freezable();
while (1) {
- if (!allocator_thread_running(ca)) {
- ca->allocator_state = ALLOCATOR_stopped;
- if (kthread_wait_freezable(allocator_thread_running(ca)))
- break;
- }
-
- ca->allocator_state = ALLOCATOR_running;
-
- cond_resched();
- if (kthread_should_stop())
- break;
-
- pr_debug("discarding %zu invalidated buckets",
- fifo_used(&ca->free_inc));
-
- ret = discard_invalidated_buckets(c, ca);
+ ret = kthread_wait_freezable(allocator_thread_running(ca));
if (ret)
goto stop;
- ret = bch2_invalidate_buckets(c, ca);
- if (ret)
- goto stop;
-
- if (!fifo_empty(&ca->free_inc))
- continue;
-
- pr_debug("free_inc now empty");
-
- while (1) {
+ while (!ca->alloc_heap.used) {
cond_resched();
- /*
- * Find some buckets that we can invalidate, either
- * they're completely unused, or only contain clean data
- * that's been written back to the backing device or
- * another cache tier
- */
- pr_debug("scanning for reclaimable buckets");
+ ret = kthread_wait_freezable(buckets_available(ca, gc_count));
+ if (ret)
+ goto stop;
+ gc_count = c->gc_count;
nr = find_reclaimable_buckets(c, ca);
- pr_debug("found %zu buckets", nr);
-
- trace_alloc_batch(ca, nr, ca->alloc_heap.size);
+ trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
+ ca->inc_gen_really_needs_gc);
if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
ca->inc_gen_really_needs_gc) &&
@@ -1083,33 +978,24 @@ static int bch2_allocator_thread(void *arg)
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
}
+ }
- if (nr)
- break;
+ ret = bch2_invalidate_buckets(c, ca);
+ if (ret)
+ goto stop;
- /*
- * If we found any buckets, we have to invalidate them
- * before we scan for more - but if we didn't find very
- * many we may want to wait on more buckets being
- * available so we don't spin:
- */
- ret = wait_buckets_available(c, ca);
+ while (!fifo_empty(&ca->free_inc)) {
+ u64 b = fifo_peek(&ca->free_inc);
+
+ discard_one_bucket(c, ca, b);
+
+ ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
if (ret)
goto stop;
}
-
- pr_debug("%zu buckets to invalidate", nr);
-
- /*
- * alloc_heap is now full of newly-invalidated buckets: next,
- * write out the new bucket gens:
- */
}
-
stop:
- pr_debug("alloc thread stopping (ret %i)", ret);
- ca->allocator_state = ALLOCATOR_stopped;
- closure_wake_up(&c->freelist_wait);
+ alloc_thread_set_state(ca, ALLOCATOR_stopped);
return 0;
}
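The other half of the cleanup is the bch2_invalidate_one_bucket() path above:
the open-coded iterator setup, commit, and retry-on-EINTR loop from
bch2_invalidate_one_bucket2() is folded into a single bch2_trans_do() call that
runs bucket_invalidate_btree() inside a transaction. The sketch below shows
roughly what such a run-and-commit wrapper has to do; it is an assumption about
bch2_trans_do()'s shape (including the bch2_trans_begin() restart helper), not
its actual definition.

/*
 * Illustrative only: roughly what a run-then-commit-with-retry wrapper
 * like bch2_trans_do() implements (the actual macro lives in the btree
 * update headers and may differ; bch2_trans_begin() is assumed here).
 * The _do expression sees the local 'trans', which is why the caller
 * passes bucket_invalidate_btree(&trans, ca, b).
 */
#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)	\
({									\
	struct btree_trans trans;					\
	int _ret;							\
									\
	bch2_trans_init(&trans, (_c), 0, 0);				\
	do {								\
		bch2_trans_begin(&trans);				\
		_ret = (_do) ?:						\
			bch2_trans_commit(&trans, (_disk_res),		\
					  (_journal_seq), (_flags));	\
	} while (_ret == -EINTR);					\
	bch2_trans_exit(&trans);					\
	_ret;								\
})

Because the retry happens inside the wrapper, bucket_invalidate_btree() only
has to allocate its bkey buffer with bch2_trans_kmalloc() and stage the update
with bch2_trans_update(); the caller chooses the commit flags.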
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index f2f392eeb54a..6bf4140477a0 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -1,57 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Primary bucket allocation code
- *
* Copyright 2012 Google, Inc.
*
- * Allocation in bcache is done in terms of buckets:
- *
- * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
- * btree pointers - they must match for the pointer to be considered valid.
- *
- * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
- * bucket simply by incrementing its gen.
- *
- * The gens (along with the priorities; it's really the gens are important but
- * the code is named as if it's the priorities) are written in an arbitrary list
- * of buckets on disk, with a pointer to them in the journal header.
- *
- * When we invalidate a bucket, we have to write its new gen to disk and wait
- * for that write to complete before we use it - otherwise after a crash we
- * could have pointers that appeared to be good but pointed to data that had
- * been overwritten.
- *
- * Since the gens and priorities are all stored contiguously on disk, we can
- * batch this up: We fill up the free_inc list with freshly invalidated buckets,
- * call prio_write(), and when prio_write() finishes we pull buckets off the
- * free_inc list and optionally discard them.
- *
- * free_inc isn't the only freelist - if it was, we'd often have to sleep while
- * priorities and gens were being written before we could allocate. c->free is a
- * smaller freelist, and buckets on that list are always ready to be used.
- *
- * If we've got discards enabled, that happens when a bucket moves from the
- * free_inc list to the free list.
- *
- * It's important to ensure that gens don't wrap around - with respect to
- * either the oldest gen in the btree or the gen on disk. This is quite
- * difficult to do in practice, but we explicitly guard against it anyways - if
- * a bucket is in danger of wrapping around we simply skip invalidating it that
- * time around, and we garbage collect or rewrite the priorities sooner than we
- * would have otherwise.
+ * Foreground allocator code: allocate buckets from freelist, and allocate in
+ * sector granularity from writepoints.
*
* bch2_bucket_alloc() allocates a single bucket from a specific device.
*
* bch2_bucket_alloc_set() allocates one or more buckets from different devices
* in a given filesystem.
- *
- * invalidate_buckets() drives all the processes described above. It's called
- * from bch2_bucket_alloc() and a few other places that need to make sure free
- * buckets are ready.
- *
- * invalidate_buckets_(lru|fifo)() find buckets that are available to be
- * invalidated, and then invalidate them and stick them on the free_inc list -
- * in either lru or fifo order.
*/
#include "bcachefs.h"
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 887c0adddf12..c6d98f4c50e7 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -380,24 +380,27 @@ DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
/* Allocator */
-TRACE_EVENT(alloc_batch,
- TP_PROTO(struct bch_dev *ca, size_t free, size_t total),
- TP_ARGS(ca, free, total),
+TRACE_EVENT(alloc_scan,
+ TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped),
+ TP_ARGS(ca, found, inc_gen, inc_gen_skipped),
TP_STRUCT__entry(
- __array(char, uuid, 16 )
- __field(size_t, free )
- __field(size_t, total )
+ __field(dev_t, dev )
+ __field(u64, found )
+ __field(u64, inc_gen )
+ __field(u64, inc_gen_skipped )
),
TP_fast_assign(
- memcpy(__entry->uuid, ca->uuid.b, 16);
- __entry->free = free;
- __entry->total = total;
+ __entry->dev = ca->disk_sb.bdev->bd_dev;
+ __entry->found = found;
+ __entry->inc_gen = inc_gen;
+ __entry->inc_gen_skipped = inc_gen_skipped;
),
- TP_printk("%pU free %zu total %zu",
- __entry->uuid, __entry->free, __entry->total)
+ TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->found, __entry->inc_gen, __entry->inc_gen_skipped)
);
TRACE_EVENT(invalidate,
@@ -417,8 +420,10 @@ TRACE_EVENT(invalidate,
),
TP_printk("invalidated %u sectors at %d,%d sector=%llu",
- __entry->sectors, MAJOR(__entry->dev),
- MINOR(__entry->dev), __entry->offset)
+ __entry->sectors,
+ MAJOR(__entry->dev),
+ MINOR(__entry->dev),
+ __entry->offset)
);
DECLARE_EVENT_CLASS(bucket_alloc,
@@ -426,16 +431,18 @@ DECLARE_EVENT_CLASS(bucket_alloc,
TP_ARGS(ca, reserve),
TP_STRUCT__entry(
- __array(char, uuid, 16)
- __field(enum alloc_reserve, reserve )
+ __field(dev_t, dev )
+ __field(enum alloc_reserve, reserve )
),
TP_fast_assign(
- memcpy(__entry->uuid, ca->uuid.b, 16);
- __entry->reserve = reserve;
+ __entry->dev = ca->disk_sb.bdev->bd_dev;
+ __entry->reserve = reserve;
),
- TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve)
+ TP_printk("%d,%d reserve %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->reserve)
);
DEFINE_EVENT(bucket_alloc, bucket_alloc,