diff options
Diffstat (limited to 'fs/bcachefs/alloc_background.c')
-rw-r--r-- | fs/bcachefs/alloc_background.c | 560 |
1 files changed, 4 insertions, 556 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 5d553d9b6151..3ba2b35fad53 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -27,13 +27,6 @@ #include <linux/sched/task.h> #include <linux/sort.h> -const char * const bch2_allocator_states[] = { -#define x(n) #n, - ALLOC_THREAD_STATES() -#undef x - NULL -}; - /* Persistent alloc info: */ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { @@ -431,7 +424,6 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) g->_mark.gen = a.gen; g->io_time[READ] = a.io_time[READ]; g->io_time[WRITE] = a.io_time[WRITE]; - g->oldest_gen = !gc ? a.oldest_gen : a.gen; g->gen_valid = 1; if (!gc || @@ -553,7 +545,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); - SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); } if (old_a.data_type && !new_a->data_type && @@ -698,493 +689,6 @@ out: return ret; } -/* Background allocator thread: */ - -/* - * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens - * (marking them as invalidated on disk), then optionally issues discard - * commands to the newly free buckets, then puts them on the various freelists. - */ - -/* - * bucket_gc_gen() returns the difference between the bucket's current gen and - * the oldest gen of any pointer into that bucket in the btree. - */ - -static inline u8 bucket_gc_gen(struct bucket *g) -{ - return g->mark.gen - g->oldest_gen; -} - -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, - struct bucket_mark m) -{ - u8 gc_gen; - - if (!is_available_bucket(m)) - return false; - - if (m.owned_by_allocator) - return false; - - if (ca->buckets_nouse && - test_bit(b, ca->buckets_nouse)) - return false; - - if (ca->new_fs_bucket_idx) { - /* - * Device or filesystem is still being initialized, and we - * haven't fully marked superblocks & journal: - */ - if (is_superblock_bucket(ca, b)) - return false; - - if (b < ca->new_fs_bucket_idx) - return false; - } - - gc_gen = bucket_gc_gen(bucket(ca, b)); - - ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; - ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; - - return gc_gen < BUCKET_GC_GEN_MAX; -} - -/* - * Determines what order we're going to reuse buckets, smallest bucket_key() - * first. - */ - -static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, - u64 now, u64 last_seq_ondisk) -{ - unsigned used = m.cached_sectors; - - if (used) { - /* - * Prefer to keep buckets that have been read more recently, and - * buckets that have more data in them: - */ - u64 last_read = max_t(s64, 0, now - g->io_time[READ]); - u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); - - return -last_read_scaled; - } else { - /* - * Prefer to use buckets with smaller gc_gen so that we don't - * have to walk the btree and recalculate oldest_gen - but shift - * off the low bits so that buckets will still have equal sort - * keys when there's only a small difference, so that we can - * keep sequential buckets together: - */ - return bucket_gc_gen(g) >> 4; - } -} - -static inline int bucket_alloc_cmp(alloc_heap *h, - struct alloc_heap_entry l, - struct alloc_heap_entry r) -{ - return cmp_int(l.key, r.key) ?: - cmp_int(r.nr, l.nr) ?: - cmp_int(l.bucket, r.bucket); -} - -static inline int bucket_idx_cmp(const void *_l, const void *_r) -{ - const struct alloc_heap_entry *l = _l, *r = _r; - - return cmp_int(l->bucket, r->bucket); -} - -static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets; - struct alloc_heap_entry e = { 0 }; - u64 now, last_seq_ondisk; - size_t b, i, nr = 0; - - down_read(&ca->bucket_lock); - - buckets = bucket_array(ca); - ca->alloc_heap.used = 0; - now = atomic64_read(&c->io_clock[READ].now); - last_seq_ondisk = c->journal.flushed_seq_ondisk; - - /* - * Find buckets with lowest read priority, by building a maxheap sorted - * by read priority and repeatedly replacing the maximum element until - * all buckets have been visited. - */ - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket *g = &buckets->b[b]; - struct bucket_mark m = READ_ONCE(g->mark); - unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); - - cond_resched(); - - if (!bch2_can_invalidate_bucket(ca, b, m)) - continue; - - if (!m.data_type && - bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - last_seq_ondisk, - ca->dev_idx, b)) { - ca->buckets_waiting_on_journal++; - continue; - } - - if (e.nr && e.bucket + e.nr == b && e.key == key) { - e.nr++; - } else { - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, - -bucket_alloc_cmp, NULL); - - e = (struct alloc_heap_entry) { - .bucket = b, - .nr = 1, - .key = key, - }; - } - } - - if (e.nr) - heap_add_or_replace(&ca->alloc_heap, e, - -bucket_alloc_cmp, NULL); - - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; - - while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { - nr -= ca->alloc_heap.data[0].nr; - heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); - } - - up_read(&ca->bucket_lock); -} - -static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - size_t i, nr = 0; - - ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; - ca->buckets_waiting_on_journal = 0; - - find_reclaimable_buckets_lru(c, ca); - - heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); - - for (i = 0; i < ca->alloc_heap.used; i++) - nr += ca->alloc_heap.data[i].nr; - - return nr; -} - -static int bucket_invalidate_btree(struct btree_trans *trans, - struct bch_dev *ca, u64 b, - struct bkey_i_alloc_v4 *a) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - POS(ca->dev_idx, b), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - bkey_alloc_v4_init(&a->k_i); - a->k.p = iter.pos; - bch2_alloc_to_v4(k, &a->v); - a->v.gen++; - a->v.data_type = 0; - a->v.dirty_sectors = 0; - a->v.cached_sectors = 0; - a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); - a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); - - ret = bch2_trans_update(trans, &iter, &a->k_i, - BTREE_TRIGGER_BUCKET_INVALIDATE| - BTREE_UPDATE_NO_KEY_CACHE_COHERENCY); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - u64 *journal_seq, unsigned flags) -{ - struct bkey_i_alloc_v4 a; - size_t b; - u64 commit_seq = 0; - int ret = 0; - - /* - * If the read-only path is trying to shut down, we can't be generating - * new btree updates: - */ - if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) - return 1; - - BUG_ON(!ca->alloc_heap.used || - !ca->alloc_heap.data[0].nr); - b = ca->alloc_heap.data[0].bucket; - - /* first, put on free_inc and mark as owned by allocator: */ - percpu_down_read(&c->mark_lock); - - bch2_mark_alloc_bucket(c, ca, b, true); - - spin_lock(&c->freelist_lock); - verify_not_on_freelist(c, ca, b); - BUG_ON(!fifo_push(&ca->free_inc, b)); - spin_unlock(&c->freelist_lock); - - percpu_up_read(&c->mark_lock); - - ret = bch2_trans_do(c, NULL, &commit_seq, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - flags, - bucket_invalidate_btree(&trans, ca, b, &a)); - - if (!ret) { - /* remove from alloc_heap: */ - struct alloc_heap_entry e, *top = ca->alloc_heap.data; - - top->bucket++; - top->nr--; - - if (!top->nr) - heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - - /* - * If we invalidating cached data then we need to wait on the - * journal commit: - */ - if (a.v.data_type) - *journal_seq = max(*journal_seq, commit_seq); - - /* - * We already waiting on u.alloc_seq when we filtered out - * buckets that need journal commit: - */ - BUG_ON(*journal_seq > a.v.journal_seq); - } else { - size_t b2; - - /* remove from free_inc: */ - percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); - - bch2_mark_alloc_bucket(c, ca, b, false); - - BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); - BUG_ON(b != b2); - - spin_unlock(&c->freelist_lock); - percpu_up_read(&c->mark_lock); - } - - return ret < 0 ? ret : 0; -} - -/* - * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: - */ -static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) -{ - u64 journal_seq = 0; - int ret = 0; - - /* Only use nowait if we've already invalidated at least one bucket: */ - while (!ret && - !fifo_full(&ca->free_inc) && - ca->alloc_heap.used) { - if (kthread_should_stop()) { - ret = 1; - break; - } - - ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, - (!fifo_empty(&ca->free_inc) - ? BTREE_INSERT_NOWAIT : 0)); - /* - * We only want to batch up invalidates when they're going to - * require flushing the journal: - */ - if (!journal_seq) - break; - } - - /* If we used NOWAIT, don't return the error: */ - if (!fifo_empty(&ca->free_inc)) - ret = 0; - if (ret < 0) - bch_err(ca, "error invalidating buckets: %i", ret); - if (ret) - return ret; - - if (journal_seq) - ret = bch2_journal_flush_seq(&c->journal, journal_seq); - if (ret) { - bch_err(ca, "journal error: %i", ret); - return ret; - } - - return 0; -} - -static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) -{ - if (ca->allocator_state != new_state) { - ca->allocator_state = new_state; - closure_wake_up(&ca->fs->freelist_wait); - } -} - -static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -{ - unsigned i; - int ret = 0; - - spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) { - /* - * Don't strand buckets on the copygc freelist until - * after recovery is finished: - */ - if (i == RESERVE_movinggc && - !test_bit(BCH_FS_STARTED, &c->flags)) - continue; - - if (fifo_push(&ca->free[i], b)) { - fifo_pop(&ca->free_inc, b); - ret = 1; - break; - } - } - spin_unlock(&c->freelist_lock); - - ca->allocator_state = ret - ? ALLOCATOR_running - : ALLOCATOR_blocked_full; - closure_wake_up(&c->freelist_wait); - return ret; -} - -static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -{ - if (!c->opts.nochanges && - ca->mi.discard && - bdev_max_discard_sectors(ca->disk_sb.bdev)) - blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), - ca->mi.bucket_size, GFP_NOFS); -} - -static bool allocator_thread_running(struct bch_dev *ca) -{ - unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && - test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) - ? ALLOCATOR_running - : ALLOCATOR_stopped; - alloc_thread_set_state(ca, state); - return state == ALLOCATOR_running; -} - -static int buckets_available(struct bch_dev *ca, unsigned long gc_count) -{ - s64 available = dev_buckets_reclaimable(ca) - - (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0); - bool ret = available > 0; - - alloc_thread_set_state(ca, ret - ? ALLOCATOR_running - : ALLOCATOR_blocked); - return ret; -} - -/** - * bch_allocator_thread - move buckets from free_inc to reserves - * - * The free_inc FIFO is populated by find_reclaimable_buckets(), and - * the reserves are depleted by bucket allocation. When we run out - * of free_inc, try to invalidate some buckets and write out - * prios and gens. - */ -static int bch2_allocator_thread(void *arg) -{ - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; - unsigned long gc_count = c->gc_count; - size_t nr; - int ret; - - set_freezable(); - - while (1) { - ret = kthread_wait_freezable(allocator_thread_running(ca)); - if (ret) - goto stop; - - while (!ca->alloc_heap.used) { - cond_resched(); - - ret = kthread_wait_freezable(buckets_available(ca, gc_count)); - if (ret) - goto stop; - - gc_count = c->gc_count; - nr = find_reclaimable_buckets(c, ca); - - if (!nr && ca->buckets_waiting_on_journal) { - ret = bch2_journal_flush(&c->journal); - if (ret) - goto stop; - } else if (nr < (ca->mi.nbuckets >> 6) && - ca->buckets_waiting_on_journal >= nr / 2) { - bch2_journal_flush_async(&c->journal, NULL); - } - - if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || - ca->inc_gen_really_needs_gc) && - c->gc_thread) { - atomic_inc(&c->kick_gc); - wake_up_process(c->gc_thread); - } - - trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, - ca->inc_gen_really_needs_gc); - } - - ret = bch2_invalidate_buckets(c, ca); - if (ret) - goto stop; - - while (!fifo_empty(&ca->free_inc)) { - u64 b = fifo_peek(&ca->free_inc); - - discard_one_bucket(c, ca, b); - - ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); - if (ret) - goto stop; - } - } -stop: - alloc_thread_set_state(ca, ALLOCATOR_stopped); - return 0; -} - /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) @@ -1193,7 +697,7 @@ void bch2_recalc_capacity(struct bch_fs *c) u64 capacity = 0, reserved_sectors = 0, gc_reserve; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; - unsigned i, j; + unsigned i; lockdep_assert_held(&c->state_lock); @@ -1224,8 +728,9 @@ void bch2_recalc_capacity(struct bch_fs *c) * allocations for foreground writes must wait - * not -ENOSPC calculations. */ - for (j = 0; j < RESERVE_none; j++) - dev_reserve += ca->free[j].size; + + dev_reserve += ca->nr_btree_reserve * 2; + dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ dev_reserve += 1; /* btree write point */ dev_reserve += 1; /* copygc write point */ @@ -1281,8 +786,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { unsigned i; - BUG_ON(ca->alloc_thread); - /* First, remove device from allocation groups: */ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) @@ -1356,61 +859,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } -void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) -{ - if (ca->alloc_thread) - closure_wait_event(&c->freelist_wait, - ca->allocator_state != ALLOCATOR_running); -} - -/* stop allocator thread: */ -void bch2_dev_allocator_stop(struct bch_dev *ca) -{ - struct task_struct *p; - - p = rcu_dereference_protected(ca->alloc_thread, 1); - ca->alloc_thread = NULL; - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid bch2_wake_allocator() racing: - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here - */ - synchronize_rcu(); - - if (p) { - kthread_stop(p); - put_task_struct(p); - } -} - -/* start allocator thread: */ -int bch2_dev_allocator_start(struct bch_dev *ca) -{ - struct task_struct *p; - - /* - * allocator thread already started? - */ - if (ca->alloc_thread) - return 0; - - p = kthread_create(bch2_allocator_thread, ca, - "bch-alloc/%s", ca->name); - if (IS_ERR(p)) { - bch_err(ca->fs, "error creating allocator thread: %li", - PTR_ERR(p)); - return PTR_ERR(p); - } - - get_task_struct(p); - rcu_assign_pointer(ca->alloc_thread, p); - wake_up_process(p); - return 0; -} - void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); |