Diffstat (limited to 'fs/bcachefs/alloc_background.c')
-rw-r--r--  fs/bcachefs/alloc_background.c  560
1 file changed, 4 insertions(+), 556 deletions(-)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 5d553d9b6151..3ba2b35fad53 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -27,13 +27,6 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
-const char * const bch2_allocator_states[] = {
-#define x(n) #n,
- ALLOC_THREAD_STATES()
-#undef x
- NULL
-};
-
/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@@ -431,7 +424,6 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
g->_mark.gen = a.gen;
g->io_time[READ] = a.io_time[READ];
g->io_time[WRITE] = a.io_time[WRITE];
- g->oldest_gen = !gc ? a.oldest_gen : a.gen;
g->gen_valid = 1;
if (!gc ||
@@ -553,7 +545,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
- SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
}
if (old_a.data_type && !new_a->data_type &&
@@ -698,493 +689,6 @@ out:
return ret;
}
-/* Background allocator thread: */
-
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
-
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-
-static inline u8 bucket_gc_gen(struct bucket *g)
-{
- return g->mark.gen - g->oldest_gen;
-}
-
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
- struct bucket_mark m)
-{
- u8 gc_gen;
-
- if (!is_available_bucket(m))
- return false;
-
- if (m.owned_by_allocator)
- return false;
-
- if (ca->buckets_nouse &&
- test_bit(b, ca->buckets_nouse))
- return false;
-
- if (ca->new_fs_bucket_idx) {
- /*
- * Device or filesystem is still being initialized, and we
- * haven't fully marked superblocks & journal:
- */
- if (is_superblock_bucket(ca, b))
- return false;
-
- if (b < ca->new_fs_bucket_idx)
- return false;
- }
-
- gc_gen = bucket_gc_gen(bucket(ca, b));
-
- ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
- ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX;
-
- return gc_gen < BUCKET_GC_GEN_MAX;
-}
-
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- */
-
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
- u64 now, u64 last_seq_ondisk)
-{
- unsigned used = m.cached_sectors;
-
- if (used) {
- /*
- * Prefer to keep buckets that have been read more recently, and
- * buckets that have more data in them:
- */
- u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
- u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
-
- return -last_read_scaled;
- } else {
- /*
- * Prefer to use buckets with smaller gc_gen so that we don't
- * have to walk the btree and recalculate oldest_gen - but shift
- * off the low bits so that buckets will still have equal sort
- * keys when there's only a small difference, so that we can
- * keep sequential buckets together:
- */
- return bucket_gc_gen(g) >> 4;
- }
-}
-
-static inline int bucket_alloc_cmp(alloc_heap *h,
- struct alloc_heap_entry l,
- struct alloc_heap_entry r)
-{
- return cmp_int(l.key, r.key) ?:
- cmp_int(r.nr, l.nr) ?:
- cmp_int(l.bucket, r.bucket);
-}
-
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
-{
- const struct alloc_heap_entry *l = _l, *r = _r;
-
- return cmp_int(l->bucket, r->bucket);
-}
-
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets;
- struct alloc_heap_entry e = { 0 };
- u64 now, last_seq_ondisk;
- size_t b, i, nr = 0;
-
- down_read(&ca->bucket_lock);
-
- buckets = bucket_array(ca);
- ca->alloc_heap.used = 0;
- now = atomic64_read(&c->io_clock[READ].now);
- last_seq_ondisk = c->journal.flushed_seq_ondisk;
-
- /*
- * Find buckets with lowest read priority, by building a maxheap sorted
- * by read priority and repeatedly replacing the maximum element until
- * all buckets have been visited.
- */
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket *g = &buckets->b[b];
- struct bucket_mark m = READ_ONCE(g->mark);
- unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
-
- cond_resched();
-
- if (!bch2_can_invalidate_bucket(ca, b, m))
- continue;
-
- if (!m.data_type &&
- bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- last_seq_ondisk,
- ca->dev_idx, b)) {
- ca->buckets_waiting_on_journal++;
- continue;
- }
-
- if (e.nr && e.bucket + e.nr == b && e.key == key) {
- e.nr++;
- } else {
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- e = (struct alloc_heap_entry) {
- .bucket = b,
- .nr = 1,
- .key = key,
- };
- }
- }
-
- if (e.nr)
- heap_add_or_replace(&ca->alloc_heap, e,
- -bucket_alloc_cmp, NULL);
-
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
-
- while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
- nr -= ca->alloc_heap.data[0].nr;
- heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
- }
-
- up_read(&ca->bucket_lock);
-}
-
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- size_t i, nr = 0;
-
- ca->inc_gen_needs_gc = 0;
- ca->inc_gen_really_needs_gc = 0;
- ca->buckets_waiting_on_journal = 0;
-
- find_reclaimable_buckets_lru(c, ca);
-
- heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
-
- for (i = 0; i < ca->alloc_heap.used; i++)
- nr += ca->alloc_heap.data[i].nr;
-
- return nr;
-}
-
-static int bucket_invalidate_btree(struct btree_trans *trans,
- struct bch_dev *ca, u64 b,
- struct bkey_i_alloc_v4 *a)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, b),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bkey_alloc_v4_init(&a->k_i);
- a->k.p = iter.pos;
- bch2_alloc_to_v4(k, &a->v);
- a->v.gen++;
- a->v.data_type = 0;
- a->v.dirty_sectors = 0;
- a->v.cached_sectors = 0;
- a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
- a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
-
- ret = bch2_trans_update(trans, &iter, &a->k_i,
- BTREE_TRIGGER_BUCKET_INVALIDATE|
- BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
- u64 *journal_seq, unsigned flags)
-{
- struct bkey_i_alloc_v4 a;
- size_t b;
- u64 commit_seq = 0;
- int ret = 0;
-
- /*
- * If the read-only path is trying to shut down, we can't be generating
- * new btree updates:
- */
- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
- return 1;
-
- BUG_ON(!ca->alloc_heap.used ||
- !ca->alloc_heap.data[0].nr);
- b = ca->alloc_heap.data[0].bucket;
-
- /* first, put on free_inc and mark as owned by allocator: */
- percpu_down_read(&c->mark_lock);
-
- bch2_mark_alloc_bucket(c, ca, b, true);
-
- spin_lock(&c->freelist_lock);
- verify_not_on_freelist(c, ca, b);
- BUG_ON(!fifo_push(&ca->free_inc, b));
- spin_unlock(&c->freelist_lock);
-
- percpu_up_read(&c->mark_lock);
-
- ret = bch2_trans_do(c, NULL, &commit_seq,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- flags,
- bucket_invalidate_btree(&trans, ca, b, &a));
-
- if (!ret) {
- /* remove from alloc_heap: */
- struct alloc_heap_entry e, *top = ca->alloc_heap.data;
-
- top->bucket++;
- top->nr--;
-
- if (!top->nr)
- heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
-
- /*
- * If we invalidating cached data then we need to wait on the
- * journal commit:
- */
- if (a.v.data_type)
- *journal_seq = max(*journal_seq, commit_seq);
-
- /*
- * We already waiting on u.alloc_seq when we filtered out
- * buckets that need journal commit:
- */
- BUG_ON(*journal_seq > a.v.journal_seq);
- } else {
- size_t b2;
-
- /* remove from free_inc: */
- percpu_down_read(&c->mark_lock);
- spin_lock(&c->freelist_lock);
-
- bch2_mark_alloc_bucket(c, ca, b, false);
-
- BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
- BUG_ON(b != b2);
-
- spin_unlock(&c->freelist_lock);
- percpu_up_read(&c->mark_lock);
- }
-
- return ret < 0 ? ret : 0;
-}
-
-/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
- */
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
-{
- u64 journal_seq = 0;
- int ret = 0;
-
- /* Only use nowait if we've already invalidated at least one bucket: */
- while (!ret &&
- !fifo_full(&ca->free_inc) &&
- ca->alloc_heap.used) {
- if (kthread_should_stop()) {
- ret = 1;
- break;
- }
-
- ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
- (!fifo_empty(&ca->free_inc)
- ? BTREE_INSERT_NOWAIT : 0));
- /*
- * We only want to batch up invalidates when they're going to
- * require flushing the journal:
- */
- if (!journal_seq)
- break;
- }
-
- /* If we used NOWAIT, don't return the error: */
- if (!fifo_empty(&ca->free_inc))
- ret = 0;
- if (ret < 0)
- bch_err(ca, "error invalidating buckets: %i", ret);
- if (ret)
- return ret;
-
- if (journal_seq)
- ret = bch2_journal_flush_seq(&c->journal, journal_seq);
- if (ret) {
- bch_err(ca, "journal error: %i", ret);
- return ret;
- }
-
- return 0;
-}
-
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
-{
- if (ca->allocator_state != new_state) {
- ca->allocator_state = new_state;
- closure_wake_up(&ca->fs->freelist_wait);
- }
-}
-
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- unsigned i;
- int ret = 0;
-
- spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++) {
- /*
- * Don't strand buckets on the copygc freelist until
- * after recovery is finished:
- */
- if (i == RESERVE_movinggc &&
- !test_bit(BCH_FS_STARTED, &c->flags))
- continue;
-
- if (fifo_push(&ca->free[i], b)) {
- fifo_pop(&ca->free_inc, b);
- ret = 1;
- break;
- }
- }
- spin_unlock(&c->freelist_lock);
-
- ca->allocator_state = ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked_full;
- closure_wake_up(&c->freelist_wait);
- return ret;
-}
-
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
- if (!c->opts.nochanges &&
- ca->mi.discard &&
- bdev_max_discard_sectors(ca->disk_sb.bdev))
- blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
- ca->mi.bucket_size, GFP_NOFS);
-}
-
-static bool allocator_thread_running(struct bch_dev *ca)
-{
- unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
- test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
- ? ALLOCATOR_running
- : ALLOCATOR_stopped;
- alloc_thread_set_state(ca, state);
- return state == ALLOCATOR_running;
-}
-
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
-{
- s64 available = dev_buckets_reclaimable(ca) -
- (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
- bool ret = available > 0;
-
- alloc_thread_set_state(ca, ret
- ? ALLOCATOR_running
- : ALLOCATOR_blocked);
- return ret;
-}
-
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
-{
- struct bch_dev *ca = arg;
- struct bch_fs *c = ca->fs;
- unsigned long gc_count = c->gc_count;
- size_t nr;
- int ret;
-
- set_freezable();
-
- while (1) {
- ret = kthread_wait_freezable(allocator_thread_running(ca));
- if (ret)
- goto stop;
-
- while (!ca->alloc_heap.used) {
- cond_resched();
-
- ret = kthread_wait_freezable(buckets_available(ca, gc_count));
- if (ret)
- goto stop;
-
- gc_count = c->gc_count;
- nr = find_reclaimable_buckets(c, ca);
-
- if (!nr && ca->buckets_waiting_on_journal) {
- ret = bch2_journal_flush(&c->journal);
- if (ret)
- goto stop;
- } else if (nr < (ca->mi.nbuckets >> 6) &&
- ca->buckets_waiting_on_journal >= nr / 2) {
- bch2_journal_flush_async(&c->journal, NULL);
- }
-
- if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
- ca->inc_gen_really_needs_gc) &&
- c->gc_thread) {
- atomic_inc(&c->kick_gc);
- wake_up_process(c->gc_thread);
- }
-
- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
- ca->inc_gen_really_needs_gc);
- }
-
- ret = bch2_invalidate_buckets(c, ca);
- if (ret)
- goto stop;
-
- while (!fifo_empty(&ca->free_inc)) {
- u64 b = fifo_peek(&ca->free_inc);
-
- discard_one_bucket(c, ca, b);
-
- ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
- if (ret)
- goto stop;
- }
- }
-stop:
- alloc_thread_set_state(ca, ALLOCATOR_stopped);
- return 0;
-}
-
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
@@ -1193,7 +697,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
- unsigned i, j;
+ unsigned i;
lockdep_assert_held(&c->state_lock);
@@ -1224,8 +728,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
* allocations for foreground writes must wait -
* not -ENOSPC calculations.
*/
- for (j = 0; j < RESERVE_none; j++)
- dev_reserve += ca->free[j].size;
+
+ dev_reserve += ca->nr_btree_reserve * 2;
+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
dev_reserve += 1; /* btree write point */
dev_reserve += 1; /* copygc write point */
@@ -1281,8 +786,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
- BUG_ON(ca->alloc_thread);
-
/* First, remove device from allocation groups: */
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1356,61 +859,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
- if (ca->alloc_thread)
- closure_wait_event(&c->freelist_wait,
- ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- p = rcu_dereference_protected(ca->alloc_thread, 1);
- ca->alloc_thread = NULL;
-
- /*
- * We need an rcu barrier between setting ca->alloc_thread = NULL and
- * the thread shutting down to avoid bch2_wake_allocator() racing:
- *
- * XXX: it would be better to have the rcu barrier be asynchronous
- * instead of blocking us here
- */
- synchronize_rcu();
-
- if (p) {
- kthread_stop(p);
- put_task_struct(p);
- }
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
- struct task_struct *p;
-
- /*
- * allocator thread already started?
- */
- if (ca->alloc_thread)
- return 0;
-
- p = kthread_create(bch2_allocator_thread, ca,
- "bch-alloc/%s", ca->name);
- if (IS_ERR(p)) {
- bch_err(ca->fs, "error creating allocator thread: %li",
- PTR_ERR(p));
- return PTR_ERR(p);
- }
-
- get_task_struct(p);
- rcu_assign_pointer(ca->alloc_thread, p);
- wake_up_process(p);
- return 0;
-}
-
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);