Diffstat (limited to 'fs/bcachefs/alloc_background.c')
-rw-r--r--  fs/bcachefs/alloc_background.c  560
1 file changed, 4 insertions(+), 556 deletions(-)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 5d553d9b6151..3ba2b35fad53 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -27,13 +27,6 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
-const char * const bch2_allocator_states[] = {
-#define x(n) #n,
- ALLOC_THREAD_STATES()
-#undef x
- NULL
-};
-
/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@@ -431,7 +424,6 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
g->_mark.gen = a.gen;
g->io_time[READ] = a.io_time[READ];
g->io_time[WRITE] = a.io_time[WRITE];
- g->oldest_gen = !gc ? a.oldest_gen : a.gen;
g->gen_valid = 1;
if (!gc ||
@@ -553,7 +545,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
- SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
}
if (old_a.data_type && !new_a->data_type &&
@@ -698,493 +689,6 @@ out:
return ret;
}
-/* Background allocator thread: */
-
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
-
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-
-static inline u8 bucket_gc_gen(struct bucket *g)
-{
- return g->mark.gen - g->oldest_gen;
-}
-
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
- struct bucket_mark m)
-{
- u8 gc_gen;
-
- if (!is_available_bucket(m))
- return false;
-
- if (m.owned_by_allocator)
- return false;
-
- if (ca->buckets_nouse &&
- test_bit(b, ca->buckets_nouse))
- return false;
-
- if (ca->new_fs_bucket_idx) {
- /*
- * Device or filesystem is still being initialized, and we
- * haven't fully marked superblocks & journal:
- */
- if (is_superblock_bucket(ca, b))
- return false;
-
- if (b < ca->new_fs_bucket_idx)
- return false;
- }
-
- gc_gen = bucket_gc_gen(bucket(ca, b));
-
- ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
- ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX;
-
- return gc_gen < BUCKET_GC_GEN_MAX;
-}
-
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- */
-
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
- u64 now, u64 last_seq_ondisk)
-{
- unsigned used = m.cached_sectors;
-
- if (used) {
- /*
- * Prefer to keep buckets that have been read more recently, and
- * buckets that have more data in them:
- */
- u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
- u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
-
- return -last_read_scaled;
- } else {
- /*
- * Prefer to use buckets with smaller gc_gen so that we don't
- * have to walk the btree and recalculate oldest_gen - but shift
- * off the low bits so that buckets will still have equal sort
- * keys when there's only a small difference, so that we can
- * keep sequential buckets together:
- */
- return bucket_gc_gen(g) >> 4;
- }
-}
-
-static inline int bucket_alloc_cmp(alloc_heap *h,
- struct alloc_heap_entry l,
- struct alloc_heap_entry r)
-{
- return cmp_int(l.key, r.key) ?:
- cmp_int(r.nr, l.nr) ?:
- cmp_int(l.bucket, r.bucket);
-}
-
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
-{
- const struct alloc_heap_entry *l = _l, *r = _r;
-
- return cmp_int(l->bucket, r->bucket);
-}
-
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets;
- struct alloc_heap_entry e = { 0 };
- u64 now, last_seq_ondisk;
- size_t b, i, nr = 0;
-
- down_read(&ca->bucket_lock);
-
- buckets = bucket_array(ca);
- ca->alloc_heap.used = 0;
- now = atomic64_read(&c->io_clock[READ].now);
- last_seq_ondisk = c->journal.flushed_seq_ondisk;
-
- /*
- * Find buckets with lowest read priority, by building a maxheap sorted
- * by read priority and repeatedly replacing the maximum element until
- * all buckets have been visited.
- */
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket *g = &buckets->b[b];
- struct bucket_mark m = READ_ONCE(g->mark);
- unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
-
- cond_resched();
-
- if (!bch2_can_invalidate_bucket(ca, b, m))
- continue;
-
- if (!m.data_type &&
- bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- last_seq_ondisk,
- ca->dev_idx, b)) {
- ca->buckets_waiting_on_journal++;
- continue;
- }
-
- if (e.nr && e.bucket + e.nr == b && e.key == key) {
- e.nr++;
- } else {
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- e = (struct alloc_heap_entry) {
- .bucket = b,
- .nr = 1,
- .key = key,
- };
- }
- }
-
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
-
- while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
- nr -= ca->alloc_heap.data[0].nr;
- heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
- }
-
- up_read(&ca->bucket_lock);
-}
-
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- size_t i, nr = 0;
-
- ca->inc_gen_needs_gc = 0;
- ca->inc_gen_really_needs_gc = 0;
- ca->buckets_waiting_on_journal = 0;
-
- find_reclaimable_buckets_lru(c, ca);
-
- heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
-
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
-
- return nr;
-}
-
-static int bucket_invalidate_btree(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- struct bkey_i_alloc_v4 *a)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, b),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bkey_alloc_v4_init(&a->k_i);
- a->k.p = iter.pos;
- bch2_alloc_to_v4(k, &a->v);
- a->v.gen++;
- a->v.data_type = 0;
- a->v.dirty_sectors = 0;
- a->v.cached_sectors = 0;
- a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
- a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
-
- ret = bch2_trans_update(trans, &iter, &a->k_i,
- BTREE_TRIGGER_BUCKET_INVALIDATE|
- BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
- u64 *journal_seq, unsigned flags)
-{
- struct bkey_i_alloc_v4 a;
- size_t b;
- u64 commit_seq = 0;
- int ret = 0;
-
- /*
- * If the read-only path is trying to shut down, we can't be generating
- * new btree updates:
- */
- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
- return 1;
-
- BUG_ON(!ca->alloc_heap.used ||
- !ca->alloc_heap.data[0].nr);
- b = ca->alloc_heap.data[0].bucket;
-
- /* first, put on free_inc and mark as owned by allocator: */
- percpu_down_read(&c->mark_lock);
-
- bch2_mark_alloc_bucket(c, ca, b, true);
-
- spin_lock(&c->freelist_lock);
- verify_not_on_freelist(c, ca, b);
- BUG_ON(!fifo_push(&ca->free_inc, b));
- spin_unlock(&c->freelist_lock);
-
- percpu_up_read(&c->mark_lock);
-
- ret = bch2_trans_do(c, NULL, &commit_seq,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- flags,
- bucket_invalidate_btree(&trans, ca, b, &a));
-
- if (!ret) {
- /* remove from alloc_heap: */
- struct alloc_heap_entry e, *top = ca->alloc_heap.data;
-
- top->bucket++;
- top->nr--;
-
- if (!top->nr)
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
-
- /*
- * If we invalidating cached data then we need to wait on the
- * journal commit:
- */
- if (a.v.data_type)
- *journal_seq = max(*journal_seq, commit_seq);
-
- /*
- * We already waiting on u.alloc_seq when we filtered out
- * buckets that need journal commit:
- */
- BUG_ON(*journal_seq > a.v.journal_seq);
- } else {
- size_t b2;
-
- /* remove from free_inc: */
- percpu_down_read(&c->mark_lock);
- spin_lock(&c->freelist_lock);
-
- bch2_mark_alloc_bucket(c, ca, b, false);
-
- BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
- BUG_ON(b != b2);
-
- spin_unlock(&c->freelist_lock);
- percpu_up_read(&c->mark_lock);
- }
-
- return ret < 0 ? ret : 0;
-}
-
-/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
- */
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- u64 journal_seq = 0;
- int ret = 0;
-
- /* Only use nowait if we've already invalidated at least one bucket: */
- while (!ret &&
- !fifo_full(&ca->free_inc) &&
- ca->alloc_heap.used) {
- if (kthread_should_stop()) {
- ret = 1;
- break;
- }
-
- ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
- (!fifo_empty(&ca->free_inc)
- ? BTREE_INSERT_NOWAIT : 0));
- /*
- * We only want to batch up invalidates when they're going to
- * require flushing the journal:
- */
- if (!journal_seq)
- break;
- }
-
- /* If we used NOWAIT, don't return the error: */
- if (!fifo_empty(&ca->free_inc))
- ret = 0;
- if (ret < 0)
- bch_err(ca, "error invalidating buckets: %i", ret);
- if (ret)
- return ret;
-
- if (journal_seq)
- ret = bch2_journal_flush_seq(&c->journal, journal_seq);
- if (ret) {
- bch_err(ca, "journal error: %i", ret);
- return ret;
- }
-
- return 0;
-}
-
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
-{
- if (ca->allocator_state != new_state) {
- ca->allocator_state = new_state;
- closure_wake_up(&ca->fs->freelist_wait);
- }
-}
-
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- unsigned i;
- int ret = 0;
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- /*
- * Don't strand buckets on the copygc freelist until
- * after recovery is finished:
- */
- if (i == RESERVE_movinggc &&
- !test_bit(BCH_FS_STARTED, &c->flags))
- continue;
-
- if (fifo_push(&ca->free[i], b)) {
- fifo_pop(&ca->free_inc, b);
- ret = 1;
- break;
- }
- }
- spin_unlock(&c->freelist_lock);
-
- ca->allocator_state = ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked_full;
- closure_wake_up(&c->freelist_wait);
- return ret;
-}
-
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- if (!c->opts.nochanges &&
- ca->mi.discard &&
- bdev_max_discard_sectors(ca->disk_sb.bdev))
- blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
- ca->mi.bucket_size, GFP_NOFS);
-}
-
-static bool allocator_thread_running(struct bch_dev *ca)
-{
- unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
- ? ALLOCATOR_running
- : ALLOCATOR_stopped;
- alloc_thread_set_state(ca, state);
- return state == ALLOCATOR_running;
-}
-
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
-{
- s64 available = dev_buckets_reclaimable(ca) -
- (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
- bool ret = available > 0;
-
- alloc_thread_set_state(ca, ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked);
- return ret;
-}
-
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
-{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
- unsigned long gc_count = c->gc_count;
- size_t nr;
- int ret;
-
- set_freezable();
-
- while (1) {
- ret = kthread_wait_freezable(allocator_thread_running(ca));
- if (ret)
- goto stop;
-
- while (!ca->alloc_heap.used) {
- cond_resched();
-
- ret = kthread_wait_freezable(buckets_available(ca, gc_count));
- if (ret)
- goto stop;
-
- gc_count = c->gc_count;
- nr = find_reclaimable_buckets(c, ca);
-
- if (!nr && ca->buckets_waiting_on_journal) {
- ret = bch2_journal_flush(&c->journal);
- if (ret)
- goto stop;
- } else if (nr < (ca->mi.nbuckets >> 6) &&
- ca->buckets_waiting_on_journal >= nr / 2) {
- bch2_journal_flush_async(&c->journal, NULL);
- }
-
- if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
- ca->inc_gen_really_needs_gc) &&
- c->gc_thread) {
- atomic_inc(&c->kick_gc);
- wake_up_process(c->gc_thread);
- }
-
- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
- ca->inc_gen_really_needs_gc);
- }
-
- ret = bch2_invalidate_buckets(c, ca);
- if (ret)
- goto stop;
-
- while (!fifo_empty(&ca->free_inc)) {
- u64 b = fifo_peek(&ca->free_inc);
-
- discard_one_bucket(c, ca, b);
-
- ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
- if (ret)
- goto stop;
- }
- }
-stop:
- alloc_thread_set_state(ca, ALLOCATOR_stopped);
- return 0;
-}
-
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
@@ -1193,7 +697,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
- unsigned i, j;
+ unsigned i;
lockdep_assert_held(&c->state_lock);
@@ -1224,8 +728,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
* allocations for foreground writes must wait -
* not -ENOSPC calculations.
*/
- for (j = 0; j < RESERVE_none; j++)
- dev_reserve += ca->free[j].size;
+
+ dev_reserve += ca->nr_btree_reserve * 2;
+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
dev_reserve += 1; /* btree write point */
dev_reserve += 1; /* copygc write point */
@@ -1281,8 +786,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
- BUG_ON(ca->alloc_thread);
-
/* First, remove device from allocation groups: */
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1356,61 +859,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
- if (ca->alloc_thread)
- closure_wait_event(&c->freelist_wait,
- ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- p = rcu_dereference_protected(ca->alloc_thread, 1);
- ca->alloc_thread = NULL;
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid bch2_wake_allocator() racing:
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
- */
- synchronize_rcu();
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- p = kthread_create(bch2_allocator_thread, ca,
- "bch-alloc/%s", ca->name);
- if (IS_ERR(p)) {
- bch_err(ca->fs, "error creating allocator thread: %li",
- PTR_ERR(p));
- return PTR_ERR(p);
- }
-
- get_task_struct(p);
- rcu_assign_pointer(ca->alloc_thread, p);
- wake_up_process(p);
- return 0;
-}
-
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);