Diffstat (limited to 'fs/bcachefs')
-rw-r--r--  fs/bcachefs/alloc_background.c  | 225
-rw-r--r--  fs/bcachefs/alloc_types.h       |  24
-rw-r--r--  fs/bcachefs/bcachefs.h          |  11
-rw-r--r--  fs/bcachefs/bcachefs_format.h   |  18
-rw-r--r--  fs/bcachefs/btree_gc.c          |   6
-rw-r--r--  fs/bcachefs/buckets.h           |   9
-rw-r--r--  fs/bcachefs/buckets_types.h     |   2
-rw-r--r--  fs/bcachefs/clock.c             |   8
-rw-r--r--  fs/bcachefs/clock_types.h       |   2
-rw-r--r--  fs/bcachefs/journal.c           |   3
-rw-r--r--  fs/bcachefs/journal_io.c        |  33
-rw-r--r--  fs/bcachefs/movinggc.c          |   4
-rw-r--r--  fs/bcachefs/rebalance.c         |  10
-rw-r--r--  fs/bcachefs/rebalance_types.h   |   2
-rw-r--r--  fs/bcachefs/recovery.c          |  19
-rw-r--r--  fs/bcachefs/super-io.c          |  60
-rw-r--r--  fs/bcachefs/super-io.h          |   5
-rw-r--r--  fs/bcachefs/super.c             |   6
-rw-r--r--  fs/bcachefs/sysfs.c             |   4
19 files changed, 141 insertions(+), 310 deletions(-)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 9a670bb2ccfb..bba83011b18b 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -31,8 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#undef x
};
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
/* Ratelimiting/PD controllers */
static void pd_controllers_update(struct work_struct *work)
@@ -340,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ int ret;
down_read(&c->gc_lock);
ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
@@ -358,22 +354,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock);
- mutex_lock(&c->bucket_clock[READ].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, READ);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[READ].lock);
-
- mutex_lock(&c->bucket_clock[WRITE].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, WRITE);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[WRITE].lock);
-
return 0;
}
@@ -460,114 +440,6 @@ err:
/* Bucket IO clocks: */
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket *g;
- u16 max_last_io = 0;
- unsigned i;
-
- lockdep_assert_held(&c->bucket_clock[rw].lock);
-
- /* Recalculate max_last_io for this device: */
- for_each_bucket(g, buckets)
- max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
- ca->max_last_bucket_io[rw] = max_last_io;
-
- /* Recalculate global max_last_io: */
- max_last_io = 0;
-
- for_each_member_device(ca, c, i)
- max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
- clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets;
- struct bch_dev *ca;
- struct bucket *g;
- unsigned i;
-
- trace_rescale_prios(c);
-
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- g->io_time[rw] = clock->hand -
- bucket_last_io(c, g, rw) / 2;
-
- bch2_recalc_oldest_io(c, ca, rw);
-
- up_read(&ca->bucket_lock);
- }
-}
-
-static inline u64 bucket_clock_freq(u64 capacity)
-{
- return max(capacity >> 10, 2028ULL);
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
- struct bucket_clock *clock = container_of(timer,
- struct bucket_clock, rescale);
- struct bch_fs *c = container_of(clock,
- struct bch_fs, bucket_clock[clock->rw]);
- struct bch_dev *ca;
- u64 capacity;
- unsigned i;
-
- mutex_lock(&clock->lock);
-
- /* if clock cannot be advanced more, rescale prio */
- if (clock->max_last_io >= U16_MAX - 2)
- bch2_rescale_bucket_io_times(c, clock->rw);
-
- BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
- for_each_member_device(ca, c, i)
- ca->max_last_bucket_io[clock->rw]++;
- clock->max_last_io++;
- clock->hand++;
-
- mutex_unlock(&clock->lock);
-
- capacity = READ_ONCE(c->capacity);
-
- if (!capacity)
- return;
-
- /*
- * we only increment when 0.1% of the filesystem capacity has been read
- * or written too, this determines if it's time
- *
- * XXX: we shouldn't really be going off of the capacity of devices in
- * RW mode (that will be 0 when we're RO, yet we can still service
- * reads)
- */
- timer->expire += bucket_clock_freq(capacity);
-
- bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
-
- clock->hand = 1;
- clock->rw = rw;
- clock->rescale.fn = bch2_inc_clock_hand;
- clock->rescale.expire = bucket_clock_freq(c->capacity);
- mutex_init(&clock->lock);
-}
-
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
size_t bucket_nr, int rw)
{
@@ -577,7 +449,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
struct bucket *g;
struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
- u64 *time;
+ u64 *time, now;
int ret = 0;
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
@@ -599,10 +471,11 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
percpu_up_read(&c->mark_lock);
time = rw == READ ? &u.read_time : &u.write_time;
- if (*time == c->bucket_clock[rw].hand)
+ now = atomic64_read(&c->io_clock[rw].now);
+ if (*time == now)
goto out;
- *time = c->bucket_clock[rw].hand;
+ *time = now;
bch2_alloc_pack(c, a, u);
ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
@@ -674,23 +547,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
- size_t bucket,
- struct bucket_mark mark)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+ struct bucket_mark m)
{
u8 gc_gen;
- if (!is_available_bucket(mark))
+ if (!is_available_bucket(m))
return false;
- if (mark.owned_by_allocator)
+ if (m.owned_by_allocator)
return false;
if (ca->buckets_nouse &&
- test_bit(bucket, ca->buckets_nouse))
+ test_bit(b, ca->buckets_nouse))
return false;
- gc_gen = bucket_gc_gen(ca, bucket);
+ gc_gen = bucket_gc_gen(bucket(ca, b));
if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
ca->inc_gen_needs_gc++;
@@ -704,43 +576,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
/*
* Determines what order we're going to reuse buckets, smallest bucket_key()
* first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- * indication of how hot the data is -- we scale the prio so that the prio
- * farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- * indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- * smallest bucket_gc_gen() - since incrementing the same bucket's generation
- * number repeatedly forces us to run mark and sweep gc to avoid generation
- * number wraparound.
*/
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark m)
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+ u64 now, u64 last_seq_ondisk)
{
- unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
- unsigned max_last_io = ca->max_last_bucket_io[READ];
-
- /*
- * Time since last read, scaled to [0, 8) where larger value indicates
- * more recently read data:
- */
- unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
-
- /* How much we want to keep the data in this bucket: */
- unsigned long data_wantness =
- (hotness + 1) * bucket_sectors_used(m);
+ unsigned used = bucket_sectors_used(m);
- unsigned long needs_journal_commit =
- bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+ if (used) {
+ /*
+ * Prefer to keep buckets that have been read more recently, and
+ * buckets that have more data in them:
+ */
+ u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+ u32 last_read_scaled = min_t(u64, U32_MAX, div_u64(last_read, used));
- return (data_wantness << 9) |
- (needs_journal_commit << 8) |
- (bucket_gc_gen(ca, b) / 16);
+ return -last_read_scaled;
+ } else {
+ /*
+ * Prefer to use buckets with smaller gc_gen so that we don't
+ * have to walk the btree and recalculate oldest_gen - but shift
+ * off the low bits so that buckets will still have equal sort
+ * keys when there's only a small difference, so that we can
+ * keep sequential buckets together:
+ */
+ return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4) |
+ (bucket_gc_gen(g) >> 4);
+ }
}
static inline int bucket_alloc_cmp(alloc_heap *h,
@@ -763,16 +625,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 };
+ u64 now, last_seq_ondisk;
size_t b, i, nr = 0;
- ca->alloc_heap.used = 0;
-
- mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
-
- bch2_recalc_oldest_io(c, ca, READ);
+ ca->alloc_heap.used = 0;
+ now = atomic64_read(&c->io_clock[READ].now);
+ last_seq_ondisk = c->journal.last_seq_ondisk;
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@@ -780,8 +641,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
* all buckets have been visited.
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
- unsigned long key = bucket_sort_key(c, ca, b, m);
+ struct bucket *g = &buckets->b[b];
+ struct bucket_mark m = READ_ONCE(g->mark);
+ unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
@@ -816,7 +678,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
}
up_read(&ca->bucket_lock);
- mutex_unlock(&c->bucket_clock[READ].lock);
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -1031,8 +892,8 @@ retry:
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
- u.read_time = c->bucket_clock[READ].hand;
- u.write_time = c->bucket_clock[WRITE].hand;
+ u.read_time = atomic64_read(&c->io_clock[READ].now);
+ u.write_time = atomic64_read(&c->io_clock[WRITE].now);
bch2_alloc_pack(c, &a, u);
bch2_trans_update(trans, iter, &a.k,
@@ -1542,8 +1403,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
- bch2_bucket_clock_init(c, READ);
- bch2_bucket_clock_init(c, WRITE);
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
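[Editor's note: with the 16-bit prio rescaling gone, the reclaim heap key is computed directly from the 64-bit IO clocks. Below is a minimal userspace sketch of the new bucket_sort_key() ordering; struct mock_bucket and its helper are simplified stand-ins, not the bcachefs API.]

/* Userspace sketch of the new bucket_sort_key() ordering; mock_bucket and
 * its fields are simplified stand-ins for the kernel's struct bucket. */
#include <stdint.h>
#include <stdio.h>

struct mock_bucket {
	uint64_t io_time_read;		/* stands in for g->io_time[READ] */
	uint32_t sectors_used;		/* stands in for bucket_sectors_used(m) */
	uint8_t  gc_gen;		/* stands in for bucket_gc_gen(g) */
	unsigned needs_journal_commit:1;
};

static unsigned sort_key(const struct mock_bucket *g, uint64_t now)
{
	if (g->sectors_used) {
		/* Staler, emptier buckets get a larger scaled age, hence a
		 * smaller key (unsigned negation), and are reclaimed first: */
		uint64_t age = now > g->io_time_read ? now - g->io_time_read : 0;
		uint64_t scaled = age / g->sectors_used;

		return -(uint32_t)(scaled < UINT32_MAX ? scaled : UINT32_MAX);
	}
	/* Empty buckets: low gc_gen first; the low bits are shifted off so
	 * nearly equal buckets tie and stay sequential: */
	return ((unsigned)g->needs_journal_commit << 4) | (g->gc_gen >> 4);
}

int main(void)
{
	uint64_t now = 1 << 20;
	struct mock_bucket hot   = { .io_time_read = now - 640,       .sectors_used = 64 };
	struct mock_bucket stale = { .io_time_read = now - (1 << 19), .sectors_used = 64 };
	struct mock_bucket empty = { .gc_gen = 32 };

	/* Smallest key is reused first: empty, then stale, then hot. */
	printf("empty=%u stale=%u hot=%u\n",
	       sort_key(&empty, now), sort_key(&stale, now), sort_key(&hot, now));
	return 0;
}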
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 1abfff5290bc..be164d6108bb 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -10,30 +10,6 @@
struct ec_bucket_buf;
-/* There's two of these clocks, one for reads and one for writes: */
-struct bucket_clock {
- /*
- * "now" in (read/write) IO time - incremented whenever we do X amount
- * of reads or writes.
- *
- * Goes with the bucket read/write prios: when we read or write to a
- * bucket we reset the bucket's prio to the current hand; thus hand -
- * prio = time since bucket was last read/written.
- *
- * The units are some amount (bytes/sectors) of data read/written, and
- * the units can change on the fly if we need to rescale to fit
- * everything in a u16 - your only guarantee is that the units are
- * consistent.
- */
- u16 hand;
- u16 max_last_io;
-
- int rw;
-
- struct io_timer rescale;
- struct mutex lock;
-};
-
enum alloc_reserve {
RESERVE_BTREE_MOVINGGC = -2,
RESERVE_BTREE = -1,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index bd675b88b354..763cac0efa0c 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -451,9 +451,6 @@ struct bch_dev {
size_t fifo_last_bucket;
- /* last calculated minimum prio */
- u16 max_last_bucket_io[2];
-
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
@@ -693,14 +690,6 @@ struct bch_fs {
struct mutex usage_scratch_lock;
struct bch_fs_usage_online *usage_scratch;
- /*
- * When we invalidate buckets, we use both the priority and the amount
- * of good data to determine which buckets to reuse first - to weight
- * those together consistently we keep track of the smallest nonzero
- * priority of any bucket.
- */
- struct bucket_clock bucket_clock[2];
-
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index b6c7e57b6bcd..5dab5bfd228a 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1143,8 +1143,8 @@ struct bch_sb_field_clean {
struct bch_sb_field field;
__le32 flags;
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
__le64 journal_seq;
union {
@@ -1511,7 +1511,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(blacklist, 3) \
x(blacklist_v2, 4) \
x(usage, 5) \
- x(data_usage, 6)
+ x(data_usage, 6) \
+ x(clock, 7)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -1559,6 +1560,13 @@ struct jset_entry_data_usage {
struct bch_replicas_entry r;
} __attribute__((packed));
+struct jset_entry_clock {
+ struct jset_entry entry;
+ __u8 rw;
+ __u8 pad[7];
+ __le64 time;
+} __attribute__((packed));
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@@ -1581,8 +1589,8 @@ struct jset {
__u8 encrypted_start[0];
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
/* Sequence number of oldest dirty journal entry */
__le64 last_seq;
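[Editor's note: a size-arithmetic sketch of the new clock entry, with plain fixed-width types standing in for __le16/__le64 (endianness handling elided).]

/* Size check for the new clock entry: 8-byte shared jset_entry header plus
 * rw/pad/time payload = 24 bytes = 3 u64s; the stored u64s field counts the
 * payload only, so journal.c below reserves 2 * 3 u64s for the two clocks. */
#include <assert.h>
#include <stdint.h>

struct mock_jset_entry {
	uint16_t u64s;			/* payload size in u64s, header excluded */
	uint8_t  btree_id, level;
	uint8_t  type;			/* BCH_JSET_ENTRY_clock == 7 */
	uint8_t  pad[3];
} __attribute__((packed));

struct mock_jset_entry_clock {
	struct mock_jset_entry entry;
	uint8_t  rw;			/* 0 == READ clock, 1 == WRITE clock */
	uint8_t  pad[7];
	uint64_t time;			/* io_clock[rw].now when written */
} __attribute__((packed));

int main(void)
{
	static_assert(sizeof(struct mock_jset_entry_clock) == 24, "layout");
	/* u64s as stored on disk: total u64s minus the one-u64 header */
	assert(sizeof(struct mock_jset_entry_clock) / sizeof(uint64_t) - 1 == 2);
	return 0;
}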
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 9e123736a125..5ea9bae09d59 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1489,7 +1489,7 @@ static int bch2_gc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last = atomic_long_read(&clock->now);
+ unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
@@ -1510,7 +1510,7 @@ static int bch2_gc_thread(void *arg)
if (c->btree_gc_periodic) {
unsigned long next = last + c->capacity / 16;
- if (atomic_long_read(&clock->now) >= next)
+ if (atomic64_read(&clock->now) >= next)
break;
bch2_io_clock_schedule_timeout(clock, next);
@@ -1522,7 +1522,7 @@ static int bch2_gc_thread(void *arg)
}
__set_current_state(TASK_RUNNING);
- last = atomic_long_read(&clock->now);
+ last = atomic64_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
/*
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 4103ea7e769a..50989d286190 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false);
}
-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
-{
- return c->bucket_clock[rw].hand - g->io_time[rw];
-}
-
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+static inline u8 bucket_gc_gen(struct bucket *g)
{
- struct bucket *g = bucket(ca, b);
-
return g->mark.gen - g->oldest_gen;
}
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 99ab9f48ba9d..b6ea67506cc2 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -37,7 +37,7 @@ struct bucket {
const struct bucket_mark mark;
};
- u16 io_time[2];
+ u64 io_time[2];
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 869ba1887757..da91c95e3ffc 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
spin_lock(&clock->timer_lock);
- if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
+ if (time_after_eq((unsigned long) atomic64_read(&clock->now),
timer->expire)) {
spin_unlock(&clock->timer_lock);
timer->fn(timer);
@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
{
struct io_timer *timer;
- unsigned long now = atomic_long_add_return(sectors, &clock->now);
+ unsigned long now = atomic64_add_return(sectors, &clock->now);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
unsigned i;
spin_lock(&clock->timer_lock);
- now = atomic_long_read(&clock->now);
+ now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
pr_buf(out, "%ps:\t%li\n",
@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
int bch2_io_clock_init(struct io_clock *clock)
{
- atomic_long_set(&clock->now, 0);
+ atomic64_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
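[Editor's note: the io_clock is now a plain 64-bit counter of sectors read/written, so it can never approach wraparound and the old rescale timer goes away. A toy userspace model of the increment-and-fire pattern follows; names are illustrative, not the bcachefs API.]

/* Minimal model of the io_clock after this change: "now" is a 64-bit
 * counter advanced by completed sectors, and timers fire once it passes
 * their expiry (the kernel keeps timers in a heap and removes them when
 * they fire; this sketch checks a single timer inline). */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct io_timer {
	uint64_t expire;
	void (*fn)(struct io_timer *);
};

static _Atomic uint64_t clock_now;

static void increment_clock(struct io_timer *t, unsigned sectors)
{
	/* atomic64_add_return() in the kernel; no rescaling step is needed
	 * now that the clock is 64 bit: */
	uint64_t now = atomic_fetch_add(&clock_now, sectors) + sectors;

	if (t && now >= t->expire)
		t->fn(t);
}

static void fired(struct io_timer *t)
{
	printf("timer fired at expire=%llu\n", (unsigned long long)t->expire);
}

int main(void)
{
	struct io_timer t = { .expire = 1024, .fn = fired };

	increment_clock(&t, 512);	/* now == 512: nothing yet */
	increment_clock(&t, 512);	/* now == 1024: fires */
	return 0;
}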
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
index 92c740a47565..5fae0012d808 100644
--- a/fs/bcachefs/clock_types.h
+++ b/fs/bcachefs/clock_types.h
@@ -26,7 +26,7 @@ struct io_timer {
typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock {
- atomic_long_t now;
+ atomic64_t now;
u16 __percpu *pcpu_buf;
unsigned max_slop;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index ba37c78c01db..379b9ad2c0f9 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1123,6 +1123,9 @@ int bch2_fs_journal_init(struct journal *j)
j->entry_u64s_reserved +=
BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
+ j->entry_u64s_reserved +=
+ 2 * (sizeof(struct jset_entry_clock) / sizeof(u64));
+
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 7e726db77881..a82548983dbd 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -426,6 +426,32 @@ fsck_err:
return ret;
}
+static int journal_entry_validate_clock(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes != sizeof(*clock),
+ c, "invalid journal entry clock: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(clock->rw > 1,
+ c, "invalid journal entry clock: bad rw")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int);
@@ -1361,8 +1387,8 @@ void bch2_journal_write(struct closure *cl)
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
- end = bch2_journal_super_entries_add_common(c, end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end,
+ le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@@ -1371,10 +1397,7 @@ void bch2_journal_write(struct closure *cl)
journal_write_compact(jset);
- jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
-
jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 8e6e4cd73886..e2472c19beaf 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -298,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last, wait;
+ u64 last, wait;
set_freezable();
@@ -306,7 +306,7 @@ static int bch2_copygc_thread(void *arg)
if (kthread_wait_freezable(c->copy_gc_enabled))
break;
- last = atomic_long_read(&clock->now);
+ last = atomic64_read(&clock->now);
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index f9a12dd797a5..2263ee41c444 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
- unsigned long io_start;
+ u64 io_start;
long throttle;
set_freezable();
- io_start = atomic_long_read(&clock->now);
+ io_start = atomic64_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
(20 - w.dev_most_full_percent),
50);
- if (atomic_long_read(&clock->now) + clock->max_slop <
+ if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
- io_start = atomic_long_read(&clock->now);
+ io_start = atomic64_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
@@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
case REBALANCE_THROTTLED:
bch2_hprint(&PBUF(h1),
(r->throttled_until_iotime -
- atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
pr_buf(out, "throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index 192c6be20ced..2f62a643c39f 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -17,7 +17,7 @@ struct bch_fs_rebalance {
atomic64_t work_unknown_dev;
enum rebalance_state state;
- unsigned long throttled_until_iotime;
+ u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index f470e0e233ce..55f7771e11c8 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -847,6 +847,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(bl_entry->end) + 1);
break;
}
+ case BCH_JSET_ENTRY_clock: {
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
+ }
}
return ret;
@@ -861,9 +867,6 @@ static int journal_replay_early(struct bch_fs *c,
int ret;
if (clean) {
- c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
-
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
@@ -876,9 +879,6 @@ static int journal_replay_early(struct bch_fs *c,
if (i->ignore)
continue;
- c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
-
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
@@ -942,13 +942,6 @@ static int verify_superblock_clean(struct bch_fs *c,
return 0;
}
- mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
- "superblock read clock %u doesn't match journal %u after clean shutdown",
- clean->read_clock, j->read_clock);
- mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
- "superblock write clock %u doesn't match journal %u after clean shutdown",
- clean->write_clock, j->write_clock);
-
for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
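[Editor's note: on recovery the clocks are now seeded from BCH_JSET_ENTRY_clock journal entries rather than the removed jset/superblock fields. A hedged sketch of that read side with mocked types; the kernel equivalent is the journal_replay_entry_early() case above.]

/* Sketch of the recovery side: seed the in-memory clocks from replayed
 * clock entries. Simplified stand-in types, not the bcachefs API. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

enum { READ_CLOCK, WRITE_CLOCK };

struct mock_clock_entry { uint8_t rw; uint64_t time; };

static _Atomic uint64_t io_clock_now[2];

static void replay_clock(const struct mock_clock_entry *e)
{
	/* rw was validated as 0 or 1 by journal_entry_validate_clock();
	 * on disk, time is little endian, so the kernel byte swaps it
	 * (le64_to_cpu) before this store: */
	atomic_store(&io_clock_now[e->rw], e->time);
}

int main(void)
{
	struct mock_clock_entry entries[] = {
		{ .rw = READ_CLOCK,  .time = 123456 },
		{ .rw = WRITE_CLOCK, .time = 654321 },
	};

	for (unsigned i = 0; i < 2; i++)
		replay_clock(&entries[i]);

	printf("read=%llu write=%llu\n",
	       (unsigned long long)atomic_load(&io_clock_now[READ_CLOCK]),
	       (unsigned long long)atomic_load(&io_clock_now[WRITE_CLOCK]));
	return 0;
}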
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 61b947313c88..3b082da934fb 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -966,29 +966,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
return ret;
}
-static void
-entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
- memset(entry, 0, u64s * sizeof(u64));
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+ memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = u64s - 1;
-}
-static void
-entry_init_size(struct jset_entry *entry, size_t size)
-{
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
- entry_init_u64s(entry, u64s);
+ *end = vstruct_next(*end);
+ return entry;
}
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *c,
- struct jset_entry *entry,
- u64 journal_seq)
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+ struct jset_entry **end,
+ u64 journal_seq)
{
unsigned i;
@@ -1003,59 +999,59 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
{
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
-
- entry = vstruct_next(entry);
}
{
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_KEY_VERSION;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
-
- entry = vstruct_next(entry);
}
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-
- entry = vstruct_next(entry);
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+ struct jset_entry_data_usage, entry);
- entry_init_size(entry, sizeof(*u) + e->nr_devs);
u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
"embedded variable length struct");
-
- entry = vstruct_next(entry);
}
percpu_up_read(&c->mark_lock);
- return entry;
+ for (i = 0; i < 2; i++) {
+ struct jset_entry_clock *clock =
+ container_of(jset_entry_init(end, sizeof(*clock)),
+ struct jset_entry_clock, entry);
+
+ clock->entry.type = BCH_JSET_ENTRY_clock;
+ clock->rw = i;
+ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+ }
}
void bch2_fs_mark_clean(struct bch_fs *c)
@@ -1084,15 +1080,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
}
sb_clean->flags = 0;
- sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
entry = sb_clean->start;
- entry = bch2_journal_super_entries_add_common(c, entry, 0);
+ bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
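[Editor's note: the refactored helper carves each entry off the buffer and advances *end itself, so callers no longer chain entry = vstruct_next(entry) by hand. A compact userspace sketch of that append pattern; the types and helper are simplified stand-ins, not the bcachefs API.]

/* Sketch of the new append pattern: entry_init() zeroes the next entry,
 * records its payload size in u64s (header excluded, per the convention
 * above), and bumps *end past it, mirroring vstruct_next(). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct entry { uint16_t u64s; uint8_t type; uint8_t pad[5]; uint64_t data[]; };

static struct entry *entry_init(struct entry **end, size_t size)
{
	struct entry *e = *end;
	unsigned u64s = (size + 7) / 8;		/* DIV_ROUND_UP(size, 8) */

	memset(e, 0, u64s * 8);
	e->u64s = u64s - 1;			/* payload u64s, header excluded */
	*end = (struct entry *)((uint64_t *)e + u64s);	/* vstruct_next() */
	return e;
}

int main(void)
{
	uint64_t buf[32];
	struct entry *end = (struct entry *)buf;

	/* Append two entries; each call advances `end` past what it wrote: */
	entry_init(&end, sizeof(struct entry) + 8)->type = 5;
	entry_init(&end, sizeof(struct entry) + 16)->type = 7;

	printf("used %zu u64s\n", (size_t)((uint64_t *)end - buf));
	return 0;
}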
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 402ae563b3c7..dd8d4ba911f0 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
/* BCH_SB_FIELD_clean: */
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *,
- struct jset_entry *, u64);
+void bch2_journal_super_entries_add_common(struct bch_fs *,
+ struct jset_entry **, u64);
void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index d451a29b517b..5f5893ab9edf 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -181,9 +181,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
- bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
- bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
@@ -406,9 +403,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
- bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 521b6d8d518f..8fdbeaf9df32 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
{
int rw = (private ? 1 : 0);
- return bucket_last_io(c, bucket(ca, b), rw);
+ return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
}
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
@@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
- return bucket_gc_gen(ca, b);
+ return bucket_gc_gen(bucket(ca, b));
}
static int unsigned_cmp(const void *_l, const void *_r)