author		Darrick J. Wong <djwong@kernel.org>	2023-12-07 05:40:59 +0300
committer	Darrick J. Wong <djwong@kernel.org>	2023-12-07 05:45:18 +0300
commit		be408417630427984a1fddd069f30b245793234c (patch)
tree		0c7a6b791f70b34ae54dd916cc580445d02bedc6 /fs/xfs/scrub
parent		4c8ecd1cfdd01fb727121035014d9f654a30bdf2 (diff)
xfs: implement block reservation accounting for btrees we're staging
Create a new xrep_newbt structure to encapsulate a fake root for
creating a staged btree cursor as well as to track all the blocks that
we need to reserve in order to build that btree.

As for the particular choice of lowspace thresholds and btree block
slack factors -- at this point one could say that the thresholds in
online repair come from bulkload_estimate_ag_slack in xfs_repair[1].
But that's not the entire story, since the offline btree rebuilding
code in xfs_repair was merged as a retroport of the online btree code
in this patchset!

Before xfs_btree_staging.[ch] came along, xfs_repair determined the
slack factor (aka the number of slots to leave unfilled in each new
btree block) via open-coded logic in repair/phase5.c[2].  At that point
the slack factors were arbitrary quantities per btree.  The rmapbt
automatically left 10 slots free; everything else left zero.  That had
a noticeable effect on performance straight after mounting because
adding records to /any/ btree would result in splits.

A few years ago when this patch was first written, Dave and I decided
that repair should generate btree blocks that were 75% full unless
space was tight, in which case it should try to fill the blocks to
nearly full.  We defined tight as ~10% free to avoid repair failures
but settled on 3/32 (~9%) to avoid div64.  IOWs, we mostly pulled the
thresholds out of thin air.  We've been QAing with those geometry
numbers ever since. ;)

Link: https://git.kernel.org/pub/scm/fs/xfs/xfsprogs-dev.git/tree/repair/bulkload.c?h=v6.5.0#n114 [1]
Link: https://git.kernel.org/pub/scm/fs/xfs/xfsprogs-dev.git/tree/repair/phase5.c?h=v4.19.0#n1349 [2]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
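As a concrete illustration of the 3/32 shortcut mentioned above (not part of
this patch, which uses div_u64(sz, 10) for its 10% check; the 3/32 variant is
what xfs_repair's bulkload_estimate_ag_slack does), here is a minimal sketch
of testing "free space below ~9% of size" without a 64-bit division; the
function name is hypothetical:

	/*
	 * Illustration only: compare free space against 3/32 (~9.4%) of the
	 * total size using shifts instead of div64.  sz * 3 / 32 becomes
	 * ((sz << 1) + sz) >> 5.  Assumes sz is a block count small enough
	 * that sz * 3 does not overflow a uint64_t.
	 */
	static inline bool example_space_is_tight(uint64_t free, uint64_t sz)
	{
		uint64_t threshold = ((sz << 1) + sz) >> 5;	/* sz * 3 / 32 */

		return free < threshold;
	}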
Diffstat (limited to 'fs/xfs/scrub')
-rw-r--r--	fs/xfs/scrub/newbt.c	495
-rw-r--r--	fs/xfs/scrub/newbt.h	62
-rw-r--r--	fs/xfs/scrub/trace.h	37
3 files changed, 594 insertions, 0 deletions
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
new file mode 100644
index 000000000000..5d1d75d2b1ad
--- /dev/null
+++ b/fs/xfs/scrub/newbt.c
@@ -0,0 +1,495 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_defer.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/newbt.h"
+
+/*
+ * Estimate proper slack values for a btree that's being reloaded.
+ *
+ * Under most circumstances, we'll take whatever default loading value the
+ * btree bulk loading code calculates for us. However, there are some
+ * exceptions to this rule:
+ *
+ * (1) If this is a per-AG btree and the AG has less than 10% space free.
+ * (2) If this is an inode btree and the FS has less than 10% space free.
+ *
+ * In either case, format the new btree blocks almost completely full to
+ * minimize space usage.
+ */
+static void
+xrep_newbt_estimate_slack(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_btree_bload *bload = &xnr->bload;
+ uint64_t free;
+ uint64_t sz;
+
+ /* Let the btree code compute the default slack values. */
+ bload->leaf_slack = -1;
+ bload->node_slack = -1;
+
+ if (sc->ops->type == ST_PERAG) {
+ free = sc->sa.pag->pagf_freeblks;
+ sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
+ } else {
+ free = percpu_counter_sum(&sc->mp->m_fdblocks);
+ sz = sc->mp->m_sb.sb_dblocks;
+ }
+
+ /* No further changes if there's more than 10% free space left. */
+ if (free >= div_u64(sz, 10))
+ return;
+
+ /*
+ * We're low on space; load the btrees as tightly as possible. Leave
+ * a couple of open slots in each btree block so that we don't end up
+ * splitting the btrees like crazy after a mount.
+ */
+ if (bload->leaf_slack < 0)
+ bload->leaf_slack = 2;
+ if (bload->node_slack < 0)
+ bload->node_slack = 2;
+}
+
+/* Initialize accounting resources for staging a new AG btree. */
+void
+xrep_newbt_init_ag(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo,
+ xfs_fsblock_t alloc_hint,
+ enum xfs_ag_resv_type resv)
+{
+ memset(xnr, 0, sizeof(struct xrep_newbt));
+ xnr->sc = sc;
+ xnr->oinfo = *oinfo; /* structure copy */
+ xnr->alloc_hint = alloc_hint;
+ xnr->resv = resv;
+ INIT_LIST_HEAD(&xnr->resv_list);
+ xrep_newbt_estimate_slack(xnr);
+}
+
+/* Initialize accounting resources for staging a new inode fork btree. */
+int
+xrep_newbt_init_inode(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc,
+ int whichfork,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_ifork *ifp;
+
+ ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
+ if (!ifp)
+ return -ENOMEM;
+
+ xrep_newbt_init_ag(xnr, sc, oinfo,
+ XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
+ XFS_AG_RESV_NONE);
+ xnr->ifake.if_fork = ifp;
+ xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
+ return 0;
+}
+
+/*
+ * Initialize accounting resources for staging a new btree. Callers are
+ * expected to add their own reservations (and clean them up) manually.
+ */
+void
+xrep_newbt_init_bare(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc)
+{
+ xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
+ XFS_AG_RESV_NONE);
+}
+
+/*
+ * Designate specific blocks to be used to build our new btree. @pag must be
+ * a passive reference.
+ */
+STATIC int
+xrep_newbt_add_blocks(
+ struct xrep_newbt *xnr,
+ struct xfs_perag *pag,
+ const struct xfs_alloc_arg *args)
+{
+ struct xfs_mount *mp = xnr->sc->mp;
+ struct xrep_newbt_resv *resv;
+
+ resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
+ if (!resv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&resv->list);
+ resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+ resv->len = args->len;
+ resv->used = 0;
+ resv->pag = xfs_perag_hold(pag);
+
+ list_add_tail(&resv->list, &xnr->resv_list);
+ return 0;
+}
+
+/* Don't let our allocation hint take us beyond this AG */
+static inline void
+xrep_newbt_validate_ag_alloc_hint(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
+
+ if (agno == sc->sa.pag->pag_agno &&
+ xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
+ return;
+
+ xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ XFS_AGFL_BLOCK(sc->mp) + 1);
+}
+
+/* Allocate disk space for a new per-AG btree. */
+STATIC int
+xrep_newbt_alloc_ag_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ ASSERT(sc->sa.pag != NULL);
+
+ while (nr_blocks > 0) {
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = mp,
+ .oinfo = xnr->oinfo,
+ .minlen = 1,
+ .maxlen = nr_blocks,
+ .prod = 1,
+ .resv = xnr->resv,
+ };
+ xfs_agnumber_t agno;
+
+ xrep_newbt_validate_ag_alloc_hint(xnr);
+
+ error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+
+ trace_xrep_newbt_alloc_ag_blocks(mp, agno,
+ XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
+ xnr->oinfo.oi_owner);
+
+ if (agno != sc->sa.pag->pag_agno) {
+ ASSERT(agno == sc->sa.pag->pag_agno);
+ return -EFSCORRUPTED;
+ }
+
+ error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
+ if (error)
+ return error;
+
+ nr_blocks -= args.len;
+ xnr->alloc_hint = args.fsbno + args.len;
+
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Don't let our allocation hint take us beyond EOFS */
+static inline void
+xrep_newbt_validate_file_alloc_hint(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+
+ if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
+ return;
+
+ xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
+}
+
+/* Allocate disk space for our new file-based btree. */
+STATIC int
+xrep_newbt_alloc_file_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ while (nr_blocks > 0) {
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = mp,
+ .oinfo = xnr->oinfo,
+ .minlen = 1,
+ .maxlen = nr_blocks,
+ .prod = 1,
+ .resv = xnr->resv,
+ };
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ xrep_newbt_validate_file_alloc_hint(xnr);
+
+ error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+
+ trace_xrep_newbt_alloc_file_blocks(mp, agno,
+ XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
+ xnr->oinfo.oi_owner);
+
+ pag = xfs_perag_get(mp, agno);
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xrep_newbt_add_blocks(xnr, pag, &args);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ nr_blocks -= args.len;
+ xnr->alloc_hint = args.fsbno + args.len;
+
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Allocate disk space for our new btree. */
+int
+xrep_newbt_alloc_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ if (xnr->sc->ip)
+ return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
+ return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
+}
+
+/*
+ * Free the unused part of a space extent that was reserved for a new ondisk
+ * structure. Returns the number of EFIs logged or a negative errno.
+ */
+STATIC int
+xrep_newbt_free_extent(
+ struct xrep_newbt *xnr,
+ struct xrep_newbt_resv *resv,
+ bool btree_committed)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ xfs_agblock_t free_agbno = resv->agbno;
+ xfs_extlen_t free_aglen = resv->len;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ if (!btree_committed || resv->used == 0) {
+ /*
+ * If we're not committing a new btree or we didn't use the
+ * space reservation, free the entire space extent.
+ */
+ goto free;
+ }
+
+ /*
+ * We used space and committed the btree. Remove the written blocks
+ * from the reservation and possibly log a new EFI to free any unused
+ * reservation space.
+ */
+ free_agbno += resv->used;
+ free_aglen -= resv->used;
+
+ if (free_aglen == 0)
+ return 0;
+
+ trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
+ free_aglen, xnr->oinfo.oi_owner);
+
+ ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
+
+free:
+ /*
+ * Use EFIs to free the reservations. This reduces the chance
+ * that we leak blocks if the system goes down.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
+ error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
+ xnr->resv, true);
+ if (error)
+ return error;
+
+ return 1;
+}
+
+/* Free all the accounting info and disk space we reserved for a new btree. */
+STATIC int
+xrep_newbt_free(
+ struct xrep_newbt *xnr,
+ bool btree_committed)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xrep_newbt_resv *resv, *n;
+ unsigned int freed = 0;
+ int error = 0;
+
+ /*
+ * If the filesystem already went down, we can't free the blocks. Skip
+ * ahead to freeing the incore metadata because we can't fix anything.
+ */
+ if (xfs_is_shutdown(sc->mp))
+ goto junkit;
+
+ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ int ret;
+
+ ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
+ list_del(&resv->list);
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ if (ret < 0) {
+ error = ret;
+ goto junkit;
+ }
+
+ freed += ret;
+ if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ goto junkit;
+ freed = 0;
+ }
+ }
+
+ if (freed)
+ error = xrep_defer_finish(sc);
+
+junkit:
+ /*
+ * If we still have reservations attached to @newbt, cleanup must have
+ * failed and the filesystem is about to go down. Clean up the incore
+ * reservations.
+ */
+ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ list_del(&resv->list);
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ }
+
+ if (sc->ip) {
+ kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
+ xnr->ifake.if_fork = NULL;
+ }
+
+ return error;
+}
+
+/*
+ * Free all the accounting info and unused disk space allocations after
+ * committing a new btree.
+ */
+int
+xrep_newbt_commit(
+ struct xrep_newbt *xnr)
+{
+ return xrep_newbt_free(xnr, true);
+}
+
+/*
+ * Free all the accounting info and all of the disk space we reserved for a new
+ * btree that we're not going to commit. We want to try to roll things back
+ * cleanly for things like ENOSPC midway through allocation.
+ */
+void
+xrep_newbt_cancel(
+ struct xrep_newbt *xnr)
+{
+ xrep_newbt_free(xnr, false);
+}
+
+/* Feed one of the reserved btree blocks to the bulk loader. */
+int
+xrep_newbt_claim_block(
+ struct xfs_btree_cur *cur,
+ struct xrep_newbt *xnr,
+ union xfs_btree_ptr *ptr)
+{
+ struct xrep_newbt_resv *resv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agblock_t agbno;
+
+ /*
+ * The first item in the list should always have a free block unless
+ * we're completely out.
+ */
+ resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
+ if (resv->used == resv->len)
+ return -ENOSPC;
+
+ /*
+ * Peel off a block from the start of the reservation. We allocate
+ * blocks in order to place blocks on disk in increasing record or key
+ * order. The block reservations tend to end up on the list in
+ * decreasing order, which hopefully results in leaf blocks ending up
+ * together.
+ */
+ agbno = resv->agbno + resv->used;
+ resv->used++;
+
+ /* If we used all the blocks in this reservation, move it to the end. */
+ if (resv->used == resv->len)
+ list_move_tail(&resv->list, &xnr->resv_list);
+
+ trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
+ xnr->oinfo.oi_owner);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
+ agbno));
+ else
+ ptr->s = cpu_to_be32(agbno);
+ return 0;
+}
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
new file mode 100644
index 000000000000..ca53271f3a4c
--- /dev/null
+++ b/fs/xfs/scrub/newbt.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_NEWBT_H__
+#define __XFS_SCRUB_NEWBT_H__
+
+struct xrep_newbt_resv {
+ /* Link to list of extents that we've reserved. */
+ struct list_head list;
+
+ struct xfs_perag *pag;
+
+ /* AG block of the extent we reserved. */
+ xfs_agblock_t agbno;
+
+ /* Length of the reservation. */
+ xfs_extlen_t len;
+
+ /* How much of this reservation has been used. */
+ xfs_extlen_t used;
+};
+
+struct xrep_newbt {
+ struct xfs_scrub *sc;
+
+ /* List of extents that we've reserved. */
+ struct list_head resv_list;
+
+ /* Fake root for new btree. */
+ union {
+ struct xbtree_afakeroot afake;
+ struct xbtree_ifakeroot ifake;
+ };
+
+ /* rmap owner of these blocks */
+ struct xfs_owner_info oinfo;
+
+ /* btree geometry for the bulk loader */
+ struct xfs_btree_bload bload;
+
+ /* Allocation hint */
+ xfs_fsblock_t alloc_hint;
+
+ /* per-ag reservation type */
+ enum xfs_ag_resv_type resv;
+};
+
+void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc);
+void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint,
+ enum xfs_ag_resv_type resv);
+int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+ int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
+void xrep_newbt_cancel(struct xrep_newbt *xnr);
+int xrep_newbt_commit(struct xrep_newbt *xnr);
+int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
+ union xfs_btree_ptr *ptr);
+
+#endif /* __XFS_SCRUB_NEWBT_H__ */
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 4a8bc6f3c8f2..aa7683075319 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1332,6 +1332,43 @@ TRACE_EVENT(xrep_ialloc_insert,
__entry->freemask)
)
+DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ int64_t owner),
+ TP_ARGS(mp, agno, agbno, len, owner),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(int64_t, owner)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = owner;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner)
+);
+#define DEFINE_NEWBT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xrep_newbt_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, \
+ int64_t owner), \
+ TP_ARGS(mp, agno, agbno, len, owner))
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block);
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
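
For reference, a minimal caller sketch (not part of the patch) showing how the
newbt.h API fits together; xrep_example_rebuild and its nr_blocks argument are
hypothetical, and the staging-cursor / bulk-loader wiring is only described in
the comment:

	/* Hypothetical caller; error handling trimmed to the essentials. */
	STATIC int
	xrep_example_rebuild(
		struct xfs_scrub	*sc,
		uint64_t		nr_blocks)
	{
		struct xrep_newbt	xnr;
		int			error;

		/* Set up the fake root and block accounting for a new AG btree. */
		xrep_newbt_init_ag(&xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
				XFS_AG_RESV_NONE);

		/* Reserve the space we think the new btree will need. */
		error = xrep_newbt_alloc_blocks(&xnr, nr_blocks);
		if (error) {
			xrep_newbt_cancel(&xnr);
			return error;
		}

		/*
		 * A real repair function would now create a staged btree cursor
		 * on xnr.afake, feed reserved blocks to the bulk loader via
		 * xrep_newbt_claim_block(), load the records, and commit the
		 * new root into the AG header.
		 */

		/* Free the accounting info and any unused reserved space. */
		return xrep_newbt_commit(&xnr);
	}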