Diffstat (limited to 'fs/bcachefs')
-rw-r--r--  fs/bcachefs/alloc_background.c  | 225
-rw-r--r--  fs/bcachefs/alloc_types.h       |  24
-rw-r--r--  fs/bcachefs/bcachefs.h          |  11
-rw-r--r--  fs/bcachefs/bcachefs_format.h   |  18
-rw-r--r--  fs/bcachefs/btree_gc.c          |   6
-rw-r--r--  fs/bcachefs/buckets.h           |   9
-rw-r--r--  fs/bcachefs/buckets_types.h     |   2
-rw-r--r--  fs/bcachefs/clock.c             |   8
-rw-r--r--  fs/bcachefs/clock_types.h       |   2
-rw-r--r--  fs/bcachefs/journal.c           |   3
-rw-r--r--  fs/bcachefs/journal_io.c        |  33
-rw-r--r--  fs/bcachefs/movinggc.c          |   4
-rw-r--r--  fs/bcachefs/rebalance.c         |  10
-rw-r--r--  fs/bcachefs/rebalance_types.h   |   2
-rw-r--r--  fs/bcachefs/recovery.c          |  19
-rw-r--r--  fs/bcachefs/super-io.c          |  60
-rw-r--r--  fs/bcachefs/super-io.h          |   5
-rw-r--r--  fs/bcachefs/super.c             |   6
-rw-r--r--  fs/bcachefs/sysfs.c             |   4
19 files changed, 141 insertions(+), 310 deletions(-)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 9a670bb2ccfb..bba83011b18b 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -31,8 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#undef x
};
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
/* Ratelimiting/PD controllers */
static void pd_controllers_update(struct work_struct *work)
@@ -340,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ int ret;
down_read(&c->gc_lock);
ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
@@ -358,22 +354,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock);
- mutex_lock(&c->bucket_clock[READ].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, READ);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[READ].lock);
-
- mutex_lock(&c->bucket_clock[WRITE].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, WRITE);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[WRITE].lock);
-
return 0;
}
@@ -460,114 +440,6 @@ err:
/* Bucket IO clocks: */
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket *g;
- u16 max_last_io = 0;
- unsigned i;
-
- lockdep_assert_held(&c->bucket_clock[rw].lock);
-
- /* Recalculate max_last_io for this device: */
- for_each_bucket(g, buckets)
- max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
- ca->max_last_bucket_io[rw] = max_last_io;
-
- /* Recalculate global max_last_io: */
- max_last_io = 0;
-
- for_each_member_device(ca, c, i)
- max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
- clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets;
- struct bch_dev *ca;
- struct bucket *g;
- unsigned i;
-
- trace_rescale_prios(c);
-
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- g->io_time[rw] = clock->hand -
- bucket_last_io(c, g, rw) / 2;
-
- bch2_recalc_oldest_io(c, ca, rw);
-
- up_read(&ca->bucket_lock);
- }
-}
-
-static inline u64 bucket_clock_freq(u64 capacity)
-{
- return max(capacity >> 10, 2028ULL);
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
- struct bucket_clock *clock = container_of(timer,
- struct bucket_clock, rescale);
- struct bch_fs *c = container_of(clock,
- struct bch_fs, bucket_clock[clock->rw]);
- struct bch_dev *ca;
- u64 capacity;
- unsigned i;
-
- mutex_lock(&clock->lock);
-
- /* if clock cannot be advanced more, rescale prio */
- if (clock->max_last_io >= U16_MAX - 2)
- bch2_rescale_bucket_io_times(c, clock->rw);
-
- BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
- for_each_member_device(ca, c, i)
- ca->max_last_bucket_io[clock->rw]++;
- clock->max_last_io++;
- clock->hand++;
-
- mutex_unlock(&clock->lock);
-
- capacity = READ_ONCE(c->capacity);
-
- if (!capacity)
- return;
-
- /*
- * we only increment when 0.1% of the filesystem capacity has been read
- * or written too, this determines if it's time
- *
- * XXX: we shouldn't really be going off of the capacity of devices in
- * RW mode (that will be 0 when we're RO, yet we can still service
- * reads)
- */
- timer->expire += bucket_clock_freq(capacity);
-
- bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
-
- clock->hand = 1;
- clock->rw = rw;
- clock->rescale.fn = bch2_inc_clock_hand;
- clock->rescale.expire = bucket_clock_freq(c->capacity);
- mutex_init(&clock->lock);
-}
-
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
size_t bucket_nr, int rw)
{
@@ -577,7 +449,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
struct bucket *g;
struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
- u64 *time;
+ u64 *time, now;
int ret = 0;
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
@@ -599,10 +471,11 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
percpu_up_read(&c->mark_lock);
time = rw == READ ? &u.read_time : &u.write_time;
- if (*time == c->bucket_clock[rw].hand)
+ now = atomic64_read(&c->io_clock[rw].now);
+ if (*time == now)
goto out;
- *time = c->bucket_clock[rw].hand;
+ *time = now;
bch2_alloc_pack(c, a, u);
ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
@@ -674,23 +547,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
- size_t bucket,
- struct bucket_mark mark)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+ struct bucket_mark m)
{
u8 gc_gen;
- if (!is_available_bucket(mark))
+ if (!is_available_bucket(m))
return false;
- if (mark.owned_by_allocator)
+ if (m.owned_by_allocator)
return false;
if (ca->buckets_nouse &&
- test_bit(bucket, ca->buckets_nouse))
+ test_bit(b, ca->buckets_nouse))
return false;
- gc_gen = bucket_gc_gen(ca, bucket);
+ gc_gen = bucket_gc_gen(bucket(ca, b));
if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
ca->inc_gen_needs_gc++;
@@ -704,43 +576,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
/*
* Determines what order we're going to reuse buckets, smallest bucket_key()
* first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- * indication of how hot the data is -- we scale the prio so that the prio
- * farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- * indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- * smallest bucket_gc_gen() - since incrementing the same bucket's generation
- * number repeatedly forces us to run mark and sweep gc to avoid generation
- * number wraparound.
*/
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark m)
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+ u64 now, u64 last_seq_ondisk)
{
- unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
- unsigned max_last_io = ca->max_last_bucket_io[READ];
-
- /*
- * Time since last read, scaled to [0, 8) where larger value indicates
- * more recently read data:
- */
- unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
-
- /* How much we want to keep the data in this bucket: */
- unsigned long data_wantness =
- (hotness + 1) * bucket_sectors_used(m);
+ unsigned used = bucket_sectors_used(m);
- unsigned long needs_journal_commit =
- bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+ if (used) {
+ /*
+ * Prefer to keep buckets that have been read more recently, and
+ * buckets that have more data in them:
+ */
+ u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+ u32 last_read_scaled = min_t(u64, U32_MAX, div_u64(last_read, used));
- return (data_wantness << 9) |
- (needs_journal_commit << 8) |
- (bucket_gc_gen(ca, b) / 16);
+ return -last_read_scaled;
+ } else {
+ /*
+ * Prefer to use buckets with smaller gc_gen so that we don't
+ * have to walk the btree and recalculate oldest_gen - but shift
+ * off the low bits so that buckets will still have equal sort
+ * keys when there's only a small difference, so that we can
+ * keep sequential buckets together:
+ */
+ return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4) |
+ (bucket_gc_gen(g) >> 4);
+ }
}
static inline int bucket_alloc_cmp(alloc_heap *h,
@@ -763,16 +625,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 };
+ u64 now, last_seq_ondisk;
size_t b, i, nr = 0;
- ca->alloc_heap.used = 0;
-
- mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
-
- bch2_recalc_oldest_io(c, ca, READ);
+ ca->alloc_heap.used = 0;
+ now = atomic64_read(&c->io_clock[READ].now);
+ last_seq_ondisk = c->journal.last_seq_ondisk;
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@@ -780,8 +641,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
* all buckets have been visited.
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
- unsigned long key = bucket_sort_key(c, ca, b, m);
+ struct bucket *g = &buckets->b[b];
+ struct bucket_mark m = READ_ONCE(g->mark);
+ unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
@@ -816,7 +678,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
}
up_read(&ca->bucket_lock);
- mutex_unlock(&c->bucket_clock[READ].lock);
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -1031,8 +892,8 @@ retry:
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
- u.read_time = c->bucket_clock[READ].hand;
- u.write_time = c->bucket_clock[WRITE].hand;
+ u.read_time = atomic64_read(&c->io_clock[READ].now);
+ u.write_time = atomic64_read(&c->io_clock[WRITE].now);
bch2_alloc_pack(c, &a, u);
bch2_trans_update(trans, iter, &a.k,
@@ -1542,8 +1403,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
- bch2_bucket_clock_init(c, READ);
- bch2_bucket_clock_init(c, WRITE);
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
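[Editor's note: with the 16-bit prio rescaling gone, the reclaim heap key is computed directly from the 64-bit IO clocks. Below is a minimal userspace sketch of the new bucket_sort_key() ordering; struct mock_bucket and its helper are simplified stand-ins, not the bcachefs API.]

/* Userspace sketch of the new bucket_sort_key() ordering; mock_bucket and
 * its fields are simplified stand-ins for the kernel's struct bucket. */
#include <stdint.h>
#include <stdio.h>

struct mock_bucket {
	uint64_t io_time_read;		/* stands in for g->io_time[READ] */
	uint32_t sectors_used;		/* stands in for bucket_sectors_used(m) */
	uint8_t  gc_gen;		/* stands in for bucket_gc_gen(g) */
	unsigned needs_journal_commit:1;
};

static unsigned sort_key(const struct mock_bucket *g, uint64_t now)
{
	if (g->sectors_used) {
		/* Staler, emptier buckets get a larger scaled age, hence a
		 * smaller key (unsigned negation), and are reclaimed first: */
		uint64_t age = now > g->io_time_read ? now - g->io_time_read : 0;
		uint64_t scaled = age / g->sectors_used;

		return -(uint32_t)(scaled < UINT32_MAX ? scaled : UINT32_MAX);
	}
	/* Empty buckets: low gc_gen first; the low bits are shifted off so
	 * nearly equal buckets tie and stay sequential: */
	return ((unsigned)g->needs_journal_commit << 4) | (g->gc_gen >> 4);
}

int main(void)
{
	uint64_t now = 1 << 20;
	struct mock_bucket hot   = { .io_time_read = now - 640,       .sectors_used = 64 };
	struct mock_bucket stale = { .io_time_read = now - (1 << 19), .sectors_used = 64 };
	struct mock_bucket empty = { .gc_gen = 32 };

	/* Smallest key is reused first: empty, then stale, then hot. */
	printf("empty=%u stale=%u hot=%u\n",
	       sort_key(&empty, now), sort_key(&stale, now), sort_key(&hot, now));
	return 0;
}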
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 1abfff5290bc..be164d6108bb 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -10,30 +10,6 @@
struct ec_bucket_buf;
-/* There's two of these clocks, one for reads and one for writes: */
-struct bucket_clock {
- /*
- * "now" in (read/write) IO time - incremented whenever we do X amount
- * of reads or writes.
- *
- * Goes with the bucket read/write prios: when we read or write to a
- * bucket we reset the bucket's prio to the current hand; thus hand -
- * prio = time since bucket was last read/written.
- *
- * The units are some amount (bytes/sectors) of data read/written, and
- * the units can change on the fly if we need to rescale to fit
- * everything in a u16 - your only guarantee is that the units are
- * consistent.
- */
- u16 hand;
- u16 max_last_io;
-
- int rw;
-
- struct io_timer rescale;
- struct mutex lock;
-};
-
enum alloc_reserve {
RESERVE_BTREE_MOVINGGC = -2,
RESERVE_BTREE = -1,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index bd675b88b354..763cac0efa0c 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -451,9 +451,6 @@ struct bch_dev {
size_t fifo_last_bucket;
- /* last calculated minimum prio */
- u16 max_last_bucket_io[2];
-
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
@@ -693,14 +690,6 @@ struct bch_fs {
struct mutex usage_scratch_lock;
struct bch_fs_usage_online *usage_scratch;
- /*
- * When we invalidate buckets, we use both the priority and the amount
- * of good data to determine which buckets to reuse first - to weight
- * those together consistently we keep track of the smallest nonzero
- * priority of any bucket.
- */
- struct bucket_clock bucket_clock[2];
-
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index b6c7e57b6bcd..5dab5bfd228a 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1143,8 +1143,8 @@ struct bch_sb_field_clean {
struct bch_sb_field field;
__le32 flags;
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
__le64 journal_seq;
union {
@@ -1511,7 +1511,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(blacklist, 3) \
x(blacklist_v2, 4) \
x(usage, 5) \
- x(data_usage, 6)
+ x(data_usage, 6) \
+ x(clock, 7)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -1559,6 +1560,13 @@ struct jset_entry_data_usage {
struct bch_replicas_entry r;
} __attribute__((packed));
+struct jset_entry_clock {
+ struct jset_entry entry;
+ __u8 rw;
+ __u8 pad[7];
+ __le64 time;
+} __attribute__((packed));
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@@ -1581,8 +1589,8 @@ struct jset {
__u8 encrypted_start[0];
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
/* Sequence number of oldest dirty journal entry */
__le64 last_seq;
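[Editor's note: a size-arithmetic sketch of the new clock entry, with plain fixed-width types standing in for __le16/__le64 (endianness handling elided).]

/* Size check for the new clock entry: 8-byte shared jset_entry header plus
 * rw/pad/time payload = 24 bytes = 3 u64s; the stored u64s field counts the
 * payload only, so journal.c below reserves 2 * 3 u64s for the two clocks. */
#include <assert.h>
#include <stdint.h>

struct mock_jset_entry {
	uint16_t u64s;			/* payload size in u64s, header excluded */
	uint8_t  btree_id, level;
	uint8_t  type;			/* BCH_JSET_ENTRY_clock == 7 */
	uint8_t  pad[3];
} __attribute__((packed));

struct mock_jset_entry_clock {
	struct mock_jset_entry entry;
	uint8_t  rw;			/* 0 == READ clock, 1 == WRITE clock */
	uint8_t  pad[7];
	uint64_t time;			/* io_clock[rw].now when written */
} __attribute__((packed));

int main(void)
{
	static_assert(sizeof(struct mock_jset_entry_clock) == 24, "layout");
	/* u64s as stored on disk: total u64s minus the one-u64 header */
	assert(sizeof(struct mock_jset_entry_clock) / sizeof(uint64_t) - 1 == 2);
	return 0;
}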
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 9e123736a125..5ea9bae09d59 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1489,7 +1489,7 @@ static int bch2_gc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last = atomic_long_read(&clock->now);
+ unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
@@ -1510,7 +1510,7 @@ static int bch2_gc_thread(void *arg)
if (c->btree_gc_periodic) {
unsigned long next = last + c->capacity / 16;
- if (atomic_long_read(&clock->now) >= next)
+ if (atomic64_read(&clock->now) >= next)
break;
bch2_io_clock_schedule_timeout(clock, next);
@@ -1522,7 +1522,7 @@ static int bch2_gc_thread(void *arg)
}
__set_current_state(TASK_RUNNING);
- last = atomic_long_read(&clock->now);
+ last = atomic64_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
/*
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 4103ea7e769a..50989d286190 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false);
}
-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
-{
- return c->bucket_clock[rw].hand - g->io_time[rw];
-}
-
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+static inline u8 bucket_gc_gen(struct bucket *g)
{
- struct bucket *g = bucket(ca, b);
-
return g->mark.gen - g->oldest_gen;
}
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 99ab9f48ba9d..b6ea67506cc2 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -37,7 +37,7 @@ struct bucket {
const struct bucket_mark mark;
};
- u16 io_time[2];
+ u64 io_time[2];
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 869ba1887757..da91c95e3ffc 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
spin_lock(&clock->timer_lock);
- if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
+ if (time_after_eq((unsigned long) atomic64_read(&clock->now),
timer->expire)) {
spin_unlock(&clock->timer_lock);
timer->fn(timer);
@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
{
struct io_timer *timer;
- unsigned long now = atomic_long_add_return(sectors, &clock->now);
+ unsigned long now = atomic64_add_return(sectors, &clock->now);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
unsigned i;
spin_lock(&clock->timer_lock);
- now = atomic_long_read(&clock->now);
+ now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
pr_buf(out, "%ps:\t%li\n",
@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
int bch2_io_clock_init(struct io_clock *clock)
{
- atomic_long_set(&clock->now, 0);
+ atomic64_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
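[Editor's note: the io_clock is now a plain 64-bit counter of sectors read/written, so it can never approach wraparound and the old rescale timer goes away. A toy userspace model of the increment-and-fire pattern follows; names are illustrative, not the bcachefs API.]

/* Minimal model of the io_clock after this change: "now" is a 64-bit
 * counter advanced by completed sectors, and timers fire once it passes
 * their expiry (the kernel keeps timers in a heap and removes them when
 * they fire; this sketch checks a single timer inline). */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct io_timer {
	uint64_t expire;
	void (*fn)(struct io_timer *);
};

static _Atomic uint64_t clock_now;

static void increment_clock(struct io_timer *t, unsigned sectors)
{
	/* atomic64_add_return() in the kernel; no rescaling step is needed
	 * now that the clock is 64 bit: */
	uint64_t now = atomic_fetch_add(&clock_now, sectors) + sectors;

	if (t && now >= t->expire)
		t->fn(t);
}

static void fired(struct io_timer *t)
{
	printf("timer fired at expire=%llu\n", (unsigned long long)t->expire);
}

int main(void)
{
	struct io_timer t = { .expire = 1024, .fn = fired };

	increment_clock(&t, 512);	/* now == 512: nothing yet */
	increment_clock(&t, 512);	/* now == 1024: fires */
	return 0;
}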
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
index 92c740a47565..5fae0012d808 100644
--- a/fs/bcachefs/clock_types.h
+++ b/fs/bcachefs/clock_types.h
@@ -26,7 +26,7 @@ struct io_timer {
typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock {
- atomic_long_t now;
+ atomic64_t now;
u16 __percpu *pcpu_buf;
unsigned max_slop;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index ba37c78c01db..379b9ad2c0f9 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -1123,6 +1123,9 @@ int bch2_fs_journal_init(struct journal *j)
j->entry_u64s_reserved +=
BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
+ j->entry_u64s_reserved +=
+ 2 * (sizeof(struct jset_entry_clock) / sizeof(u64));
+
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 7e726db77881..a82548983dbd 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -426,6 +426,32 @@ fsck_err:
return ret;
}
+static int journal_entry_validate_clock(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ int write)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes != sizeof(*clock),
+ c, "invalid journal entry clock: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(clock->rw > 1,
+ c, "invalid journal entry clock: bad rw")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int);
@@ -1361,8 +1387,8 @@ void bch2_journal_write(struct closure *cl)
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
- end = bch2_journal_super_entries_add_common(c, end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end,
+ le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@@ -1371,10 +1397,7 @@ void bch2_journal_write(struct closure *cl)
journal_write_compact(jset);
- jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
-
jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 8e6e4cd73886..e2472c19beaf 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -298,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last, wait;
+ u64 last, wait;
set_freezable();
@@ -306,7 +306,7 @@ static int bch2_copygc_thread(void *arg)
if (kthread_wait_freezable(c->copy_gc_enabled))
break;
- last = atomic_long_read(&clock->now);
+ last = atomic64_read(&clock->now);
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index f9a12dd797a5..2263ee41c444 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
- unsigned long io_start;
+ u64 io_start;
long throttle;
set_freezable();
- io_start = atomic_long_read(&clock->now);
+ io_start = atomic64_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
(20 - w.dev_most_full_percent),
50);
- if (atomic_long_read(&clock->now) + clock->max_slop <
+ if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
- io_start = atomic_long_read(&clock->now);
+ io_start = atomic64_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
@@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
case REBALANCE_THROTTLED:
bch2_hprint(&PBUF(h1),
(r->throttled_until_iotime -
- atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
pr_buf(out, "throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index 192c6be20ced..2f62a643c39f 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -17,7 +17,7 @@ struct bch_fs_rebalance {
atomic64_t work_unknown_dev;
enum rebalance_state state;
- unsigned long throttled_until_iotime;
+ u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index f470e0e233ce..55f7771e11c8 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -847,6 +847,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(bl_entry->end) + 1);
break;
}
+ case BCH_JSET_ENTRY_clock: {
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
+ }
}
return ret;
@@ -861,9 +867,6 @@ static int journal_replay_early(struct bch_fs *c,
int ret;
if (clean) {
- c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
-
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
@@ -876,9 +879,6 @@ static int journal_replay_early(struct bch_fs *c,
if (i->ignore)
continue;
- c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
-
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
@@ -942,13 +942,6 @@ static int verify_superblock_clean(struct bch_fs *c,
return 0;
}
- mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
- "superblock read clock %u doesn't match journal %u after clean shutdown",
- clean->read_clock, j->read_clock);
- mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
- "superblock write clock %u doesn't match journal %u after clean shutdown",
- clean->write_clock, j->write_clock);
-
for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
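[Editor's note: on recovery the clocks are now seeded from BCH_JSET_ENTRY_clock journal entries rather than the removed jset/superblock fields. A hedged sketch of that read side with mocked types; the kernel equivalent is the journal_replay_entry_early() case above.]

/* Sketch of the recovery side: seed the in-memory clocks from replayed
 * clock entries. Simplified stand-in types, not the bcachefs API. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

enum { READ_CLOCK, WRITE_CLOCK };

struct mock_clock_entry { uint8_t rw; uint64_t time; };

static _Atomic uint64_t io_clock_now[2];

static void replay_clock(const struct mock_clock_entry *e)
{
	/* rw was validated as 0 or 1 by journal_entry_validate_clock();
	 * on disk, time is little endian, so the kernel byte swaps it
	 * (le64_to_cpu) before this store: */
	atomic_store(&io_clock_now[e->rw], e->time);
}

int main(void)
{
	struct mock_clock_entry entries[] = {
		{ .rw = READ_CLOCK,  .time = 123456 },
		{ .rw = WRITE_CLOCK, .time = 654321 },
	};

	for (unsigned i = 0; i < 2; i++)
		replay_clock(&entries[i]);

	printf("read=%llu write=%llu\n",
	       (unsigned long long)atomic_load(&io_clock_now[READ_CLOCK]),
	       (unsigned long long)atomic_load(&io_clock_now[WRITE_CLOCK]));
	return 0;
}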
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 61b947313c88..3b082da934fb 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -966,29 +966,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
return ret;
}
-static void
-entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
- memset(entry, 0, u64s * sizeof(u64));
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+ memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = u64s - 1;
-}
-static void
-entry_init_size(struct jset_entry *entry, size_t size)
-{
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
- entry_init_u64s(entry, u64s);
+ *end = vstruct_next(*end);
+ return entry;
}
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *c,
- struct jset_entry *entry,
- u64 journal_seq)
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+ struct jset_entry **end,
+ u64 journal_seq)
{
unsigned i;
@@ -1003,59 +999,59 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
{
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
-
- entry = vstruct_next(entry);
}
{
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_KEY_VERSION;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
-
- entry = vstruct_next(entry);
}
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-
- entry = vstruct_next(entry);
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+ struct jset_entry_data_usage, entry);
- entry_init_size(entry, sizeof(*u) + e->nr_devs);
u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
"embedded variable length struct");
-
- entry = vstruct_next(entry);
}
percpu_up_read(&c->mark_lock);
- return entry;
+ for (i = 0; i < 2; i++) {
+ struct jset_entry_clock *clock =
+ container_of(jset_entry_init(end, sizeof(*clock)),
+ struct jset_entry_clock, entry);
+
+ clock->entry.type = BCH_JSET_ENTRY_clock;
+ clock->rw = i;
+ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+ }
}
void bch2_fs_mark_clean(struct bch_fs *c)
@@ -1084,15 +1080,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
}
sb_clean->flags = 0;
- sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
entry = sb_clean->start;
- entry = bch2_journal_super_entries_add_common(c, entry, 0);
+ bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
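[Editor's note: the refactored helper carves each entry off the buffer and advances *end itself, so callers no longer chain entry = vstruct_next(entry) by hand. A compact userspace sketch of that append pattern; the types and helper are simplified stand-ins, not the bcachefs API.]

/* Sketch of the new append pattern: entry_init() zeroes the next entry,
 * records its payload size in u64s (header excluded, per the convention
 * above), and bumps *end past it, mirroring vstruct_next(). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct entry { uint16_t u64s; uint8_t type; uint8_t pad[5]; uint64_t data[]; };

static struct entry *entry_init(struct entry **end, size_t size)
{
	struct entry *e = *end;
	unsigned u64s = (size + 7) / 8;		/* DIV_ROUND_UP(size, 8) */

	memset(e, 0, u64s * 8);
	e->u64s = u64s - 1;			/* payload u64s, header excluded */
	*end = (struct entry *)((uint64_t *)e + u64s);	/* vstruct_next() */
	return e;
}

int main(void)
{
	uint64_t buf[32];
	struct entry *end = (struct entry *)buf;

	/* Append two entries; each call advances `end` past what it wrote: */
	entry_init(&end, sizeof(struct entry) + 8)->type = 5;
	entry_init(&end, sizeof(struct entry) + 16)->type = 7;

	printf("used %zu u64s\n", (size_t)((uint64_t *)end - buf));
	return 0;
}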
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 402ae563b3c7..dd8d4ba911f0 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
/* BCH_SB_FIELD_clean: */
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *,
- struct jset_entry *, u64);
+void bch2_journal_super_entries_add_common(struct bch_fs *,
+ struct jset_entry **, u64);
void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index d451a29b517b..5f5893ab9edf 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -181,9 +181,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
- bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
- bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
@@ -406,9 +403,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
- bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 521b6d8d518f..8fdbeaf9df32 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
{
int rw = (private ? 1 : 0);
- return bucket_last_io(c, bucket(ca, b), rw);
+ return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
}
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
@@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
- return bucket_gc_gen(ca, b);
+ return bucket_gc_gen(bucket(ca, b));
}
static int unsigned_cmp(const void *_l, const void *_r)