diff options
Diffstat (limited to 'fs')
31 files changed, 680 insertions, 39 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 9fac5ea8d0e4..05bc865142b8 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -93,10 +93,15 @@ config XFS_RT If unsure, say N. +config XFS_DRAIN_INTENTS + bool + select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL + config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n depends on XFS_FS + select XFS_DRAIN_INTENTS help If you say Y here you will be able to check metadata on a mounted XFS filesystem. This feature is intended to reduce diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 92d88dc3c9f7..3bdbc838c4d1 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -136,6 +136,8 @@ ifeq ($(CONFIG_MEMORY_FAILURE),y) xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o endif +xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o + # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 2d8910046ed9..1b078bbbf225 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -260,6 +260,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); + xfs_defer_drain_free(&pag->pag_intents_drain); cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_buf_hash_destroy(pag); @@ -385,6 +386,7 @@ xfs_initialize_perag( spin_lock_init(&pag->pag_state_lock); INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + xfs_defer_drain_init(&pag->pag_intents_drain); init_waitqueue_head(&pag->pagb_wait); init_waitqueue_head(&pag->pag_active_wq); pag->pagb_count = 0; @@ -421,6 +423,7 @@ xfs_initialize_perag( return 0; out_remove_pag: + xfs_defer_drain_free(&pag->pag_intents_drain); radix_tree_delete(&mp->m_perag_tree, index); out_free_pag: kmem_free(pag); @@ -431,6 +434,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + xfs_defer_drain_free(&pag->pag_intents_drain); kmem_free(pag); } return error; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 8092eaba977d..2e0aef87d633 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -101,6 +101,14 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; + /* + * We use xfs_drain to track the number of deferred log intent items + * that have been queued (but not yet processed) so that waiters (e.g. + * scrub) will not lock resources when other threads are in the middle + * of processing a chain of intent items only to find momentary + * inconsistencies. + */ + struct xfs_defer_drain pag_intents_drain; #endif /* __KERNEL__ */ }; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 5a321b783398..bcfb6a4203cd 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -397,6 +397,7 @@ xfs_defer_cancel_list( list_for_each_safe(pwi, n, &dfp->dfp_work) { list_del(pwi); dfp->dfp_count--; + trace_xfs_defer_cancel_item(mp, dfp, pwi); ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); @@ -476,6 +477,7 @@ xfs_defer_finish_one( list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; + trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { int ret; @@ -623,7 +625,7 @@ xfs_defer_add( struct list_head *li) { struct xfs_defer_pending *dfp = NULL; - const struct xfs_defer_op_type *ops; + const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); @@ -636,7 +638,6 @@ xfs_defer_add( if (!list_empty(&tp->t_dfops)) { dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, dfp_list); - ops = defer_op_types[dfp->dfp_type]; if (dfp->dfp_type != type || (ops->max_items && dfp->dfp_count >= ops->max_items)) dfp = NULL; @@ -653,6 +654,7 @@ xfs_defer_add( } list_add_tail(li, &dfp->dfp_work); + trace_xfs_defer_add_item(tp->t_mountp, dfp, li); dfp->dfp_count++; } diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index c91819da1f5f..87cb13a6e84a 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -18,6 +18,15 @@ #include "scrub/scrub.h" #include "scrub/common.h" +int +xchk_setup_agheader( + struct xfs_scrub *sc) +{ + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_fs(sc); +} + /* Superblock */ /* Cross-reference with the other btrees. */ diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 39e79b9536bc..de313df2b15b 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -24,6 +24,9 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_ag_btree(sc, false); } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index f6d8cb938a02..a5078d63808f 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -31,6 +31,9 @@ xchk_setup_inode_bmap( { int error; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + error = xchk_get_inode(sc); if (error) goto out; diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index e54c1cfe64bf..626282dbe2e3 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -36,6 +36,7 @@ __xchk_btree_process_error( switch (*error) { case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 597e6aca8628..dcfe66044d4a 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -75,6 +75,7 @@ __xchk_process_error( case 0: return true; case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry( sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), @@ -130,6 +131,7 @@ __xchk_fblock_process_error( case 0: return true; case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; @@ -396,26 +398,19 @@ want_ag_read_header_failure( } /* - * Grab the perag structure and all the headers for an AG. + * Grab the AG header buffers for the attached perag structure. * * The headers should be released by xchk_ag_free, but as a fail safe we attach * all the buffers we grab to the scrub transaction so they'll all be freed - * when we cancel it. Returns ENOENT if we can't grab the perag structure. + * when we cancel it. */ -int -xchk_ag_read_headers( +static inline int +xchk_perag_read_headers( struct xfs_scrub *sc, - xfs_agnumber_t agno, struct xchk_ag *sa) { - struct xfs_mount *mp = sc->mp; int error; - ASSERT(!sa->pag); - sa->pag = xfs_perag_get(mp, agno); - if (!sa->pag) - return -ENOENT; - error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; @@ -427,6 +422,104 @@ xchk_ag_read_headers( return 0; } +/* + * Grab the AG headers for the attached perag structure and wait for pending + * intents to drain. + */ +static int +xchk_perag_drain_and_lock( + struct xfs_scrub *sc) +{ + struct xchk_ag *sa = &sc->sa; + int error = 0; + + ASSERT(sa->pag != NULL); + ASSERT(sa->agi_bp == NULL); + ASSERT(sa->agf_bp == NULL); + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + error = xchk_perag_read_headers(sc, sa); + if (error) + return error; + + /* + * If we've grabbed an inode for scrubbing then we assume that + * holding its ILOCK will suffice to coordinate with any intent + * chains involving this inode. + */ + if (sc->ip) + return 0; + + /* + * Decide if this AG is quiet enough for all metadata to be + * consistent with each other. XFS allows the AG header buffer + * locks to cycle across transaction rolls while processing + * chains of deferred ops, which means that there could be + * other threads in the middle of processing a chain of + * deferred ops. For regular operations we are careful about + * ordering operations to prevent collisions between threads + * (which is why we don't need a per-AG lock), but scrub and + * repair have to serialize against chained operations. + * + * We just locked all the AG headers buffers; now take a look + * to see if there are any intents in progress. If there are, + * drop the AG headers and wait for the intents to drain. + * Since we hold all the AG header locks for the duration of + * the scrub, this is the only time we have to sample the + * intents counter; any threads increasing it after this point + * can't possibly be in the middle of a chain of AG metadata + * updates. + * + * Obviously, this should be slanted against scrub and in favor + * of runtime threads. + */ + if (!xfs_perag_intent_busy(sa->pag)) + return 0; + + if (sa->agf_bp) { + xfs_trans_brelse(sc->tp, sa->agf_bp); + sa->agf_bp = NULL; + } + + if (sa->agi_bp) { + xfs_trans_brelse(sc->tp, sa->agi_bp); + sa->agi_bp = NULL; + } + + if (!(sc->flags & XCHK_FSGATES_DRAIN)) + return -ECHRNG; + error = xfs_perag_intent_drain(sa->pag); + if (error == -ERESTARTSYS) + error = -EINTR; + } while (!error); + + return error; +} + +/* + * Grab the per-AG structure, grab all AG header buffers, and wait until there + * aren't any pending intents. Returns -ENOENT if we can't grab the perag + * structure. + */ +int +xchk_ag_read_headers( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + ASSERT(!sa->pag); + sa->pag = xfs_perag_get(mp, agno); + if (!sa->pag) + return -ENOENT; + + return xchk_perag_drain_and_lock(sc); +} + /* Release all the AG btree cursors. */ void xchk_ag_btcur_free( @@ -916,3 +1009,25 @@ xchk_start_reaping( } sc->flags &= ~XCHK_REAPING_DISABLED; } + +/* + * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub + * operation. Callers must not hold any locks that intersect with the CPU + * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs + * to change kernel code. + */ +void +xchk_fsgates_enable( + struct xfs_scrub *sc, + unsigned int scrub_fsgates) +{ + ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL)); + ASSERT(!(sc->flags & scrub_fsgates)); + + trace_xchk_fsgates_enable(sc, scrub_fsgates); + + if (scrub_fsgates & XCHK_FSGATES_DRAIN) + xfs_drain_wait_enable(); + + sc->flags |= scrub_fsgates; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 273a4331da05..83b1a392930a 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -72,6 +72,7 @@ bool xchk_should_check_xref(struct xfs_scrub *sc, int *error, struct xfs_btree_cur **curpp); /* Setup functions */ +int xchk_setup_agheader(struct xfs_scrub *sc); int xchk_setup_fs(struct xfs_scrub *sc); int xchk_setup_ag_allocbt(struct xfs_scrub *sc); int xchk_setup_ag_iallocbt(struct xfs_scrub *sc); @@ -151,4 +152,18 @@ int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode); void xchk_stop_reaping(struct xfs_scrub *sc); void xchk_start_reaping(struct xfs_scrub *sc); +/* + * Setting up a hook to wait for intents to drain is costly -- we have to take + * the CPU hotplug lock and force an i-cache flush on all CPUs once to set it + * up, and again to tear it down. These costs add up quickly, so we only want + * to enable the drain waiter if the drain actually detected a conflict with + * running intent chains. + */ +static inline bool xchk_need_intent_drain(struct xfs_scrub *sc) +{ + return sc->flags & XCHK_NEED_DRAIN; +} + +void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); + #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index c392c0765e5c..82b150d3b8b7 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -39,6 +39,7 @@ xchk_da_process_error( switch (*error) { case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index a38006c71bff..faa315be7978 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -130,6 +130,13 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; + /* + * If the AGF doesn't track btreeblks, we have to lock the AGF to count + * btree block usage by walking the actual btrees. + */ + if (!xfs_has_lazysbcount(sc->mp)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 66e99b0f6049..d2b2a1cb6533 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -7,6 +7,8 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index b14270bd1c62..9563769a8881 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -32,6 +32,8 @@ int xchk_setup_ag_iallocbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); return xchk_setup_ag_btree(sc, sc->flags & XCHK_TRY_HARDER); } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index dc66a1465f1b..bbf9432c02c2 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -32,6 +32,9 @@ xchk_setup_inode( { int error; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + /* * Try to get the inode. If the verifiers fail, we try again * in raw mode. diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index b019c70c065a..e6caa358cbda 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -53,6 +53,9 @@ xchk_setup_quota( if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + error = xchk_setup_fs(sc); if (error) return error; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index a5005b1d010d..6f649cc01310 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -7,12 +7,15 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_refcount.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/trace.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_ag.h" @@ -24,6 +27,8 @@ int xchk_setup_ag_refcountbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); return xchk_setup_ag_btree(sc, false); } @@ -300,8 +305,10 @@ xchk_refcountbt_xref_rmap( goto out_free; xchk_refcountbt_process_rmap_fragments(&refchk); - if (irec->rc_refcount != refchk.seen) + if (irec->rc_refcount != refchk.seen) { + trace_xchk_refcount_incorrect(sc->sa.pag, irec, refchk.seen); xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + } out_free: list_for_each_entry_safe(frag, n, &refchk.fragments, list) { diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index b800341aae69..ab0758308f57 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -60,6 +60,9 @@ xrep_attempt( sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; + case -ECHRNG: + sc->flags |= XCHK_NEED_DRAIN; + return -EAGAIN; case -EDEADLOCK: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 4dc79e1a675d..c6e47ef4c79b 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -24,6 +24,9 @@ int xchk_setup_ag_rmapbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_ag_btree(sc, false); } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index e8e2bee001e5..787a9096ddef 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -145,6 +145,21 @@ xchk_probe( /* Scrub setup and teardown */ +static inline void +xchk_fsgates_disable( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XCHK_FSGATES_ALL)) + return; + + trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); + + if (sc->flags & XCHK_FSGATES_DRAIN) + xfs_drain_wait_disable(); + + sc->flags &= ~XCHK_FSGATES_ALL; +} + /* Free all the resources and finish the transactions. */ STATIC int xchk_teardown( @@ -177,6 +192,8 @@ xchk_teardown( kvfree(sc->buf); sc->buf = NULL; } + + xchk_fsgates_disable(sc); return error; } @@ -191,25 +208,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_SB] = { /* superblock */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_superblock, .repair = xrep_superblock, }, [XFS_SCRUB_TYPE_AGF] = { /* agf */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agf, .repair = xrep_agf, }, [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agfl, .repair = xrep_agfl, }, [XFS_SCRUB_TYPE_AGI] = { /* agi */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agi, .repair = xrep_agi, }, @@ -491,23 +508,20 @@ retry_op: /* Set up for the operation. */ error = sc->ops->setup(sc); + if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) + goto try_harder; + if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) + goto need_drain; if (error) goto out_teardown; /* Scrub for errors. */ error = sc->ops->scrub(sc); - if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { - /* - * Scrubbers return -EDEADLOCK to mean 'try harder'. - * Tear down everything we hold, then set up again with - * preparation for worst-case scenarios. - */ - error = xchk_teardown(sc, 0); - if (error) - goto out_sc; - sc->flags |= XCHK_TRY_HARDER; - goto retry_op; - } else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) + if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) + goto try_harder; + if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) + goto need_drain; + if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) goto out_teardown; xchk_update_health(sc); @@ -565,4 +579,21 @@ out: error = 0; } return error; +need_drain: + error = xchk_teardown(sc, 0); + if (error) + goto out_sc; + sc->flags |= XCHK_NEED_DRAIN; + goto retry_op; +try_harder: + /* + * Scrubbers return -EDEADLOCK to mean 'try harder'. Tear down + * everything we hold, then set up again with preparation for + * worst-case scenarios. + */ + error = xchk_teardown(sc, 0); + if (error) + goto out_sc; + sc->flags |= XCHK_TRY_HARDER; + goto retry_op; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 72a5a8a64a87..d85c3b883b4c 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -96,9 +96,19 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ -#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ +#define XCHK_REAPING_DISABLED (1 << 1) /* background block reaping paused */ +#define XCHK_FSGATES_DRAIN (1 << 2) /* defer ops draining enabled */ +#define XCHK_NEED_DRAIN (1 << 3) /* scrub needs to drain defer ops */ #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ +/* + * The XCHK_FSGATES* flags reflect functionality in the main filesystem that + * are only enabled for this particular online fsck. When not in use, the + * features are gated off via dynamic code patching, which is why the state + * must be enabled during scrub setup and can only be torn down afterwards. + */ +#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN) + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 81f7c3051a1a..68efd6fda61c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -30,6 +30,9 @@ TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); + TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF); @@ -93,6 +96,13 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_OFLAG_WARNING, "warning" }, \ { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" } +#define XFS_SCRUB_STATE_STRINGS \ + { XCHK_TRY_HARDER, "try_harder" }, \ + { XCHK_REAPING_DISABLED, "reaping_disabled" }, \ + { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ + { XCHK_NEED_DRAIN, "need_drain" }, \ + { XREP_ALREADY_FIXED, "already_fixed" } + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -139,6 +149,33 @@ DEFINE_SCRUB_EVENT(xchk_deadlock_retry); DEFINE_SCRUB_EVENT(xrep_attempt); DEFINE_SCRUB_EVENT(xrep_done); +DECLARE_EVENT_CLASS(xchk_fsgate_class, + TP_PROTO(struct xfs_scrub *sc, unsigned int fsgate_flags), + TP_ARGS(sc, fsgate_flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(unsigned int, fsgate_flags) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->fsgate_flags = fsgate_flags; + ), + TP_printk("dev %d:%d type %s fsgates '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_flags(__entry->fsgate_flags, "|", XFS_SCRUB_STATE_STRINGS)) +) + +#define DEFINE_SCRUB_FSHOOK_EVENT(name) \ +DEFINE_EVENT(xchk_fsgate_class, name, \ + TP_PROTO(struct xfs_scrub *sc, unsigned int fsgates_flags), \ + TP_ARGS(sc, fsgates_flags)) + +DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable); +DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable); + TRACE_EVENT(xchk_op_error, TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int error, void *ret_ip), @@ -657,6 +694,38 @@ TRACE_EVENT(xchk_fscounters_within_range, __entry->old_value) ) +TRACE_EVENT(xchk_refcount_incorrect, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, + xfs_nlink_t seen), + TP_ARGS(pag, irec, seen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + __field(xfs_nlink_t, seen) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = irec->rc_domain; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + __entry->seen = seen; + ), + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u seen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __entry->startblock, + __entry->blockcount, + __entry->refcount, + __entry->seen) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 8f0f33d07d2c..7551c3ec4ea5 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -373,7 +373,15 @@ xfs_bmap_update_get_group( xfs_agnumber_t agno; agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock); - bi->bi_pag = xfs_perag_get(mp, agno); + + /* + * Bump the intent count on behalf of the deferred rmap and refcount + * intent items that that we can queue when we finish this bmap work. + * This new intent item will bump the intent count before the bmap + * intent drops the intent count, ensuring that the intent count + * remains nonzero across the transaction roll. + */ + bi->bi_pag = xfs_perag_intent_get(mp, agno); } /* Release a passive AG ref after finishing mapping work. */ @@ -381,7 +389,7 @@ static inline void xfs_bmap_update_put_group( struct xfs_bmap_intent *bi) { - xfs_perag_put(bi->bi_pag); + xfs_perag_intent_put(bi->bi_pag); } /* Process a deferred rmap update. */ diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c new file mode 100644 index 000000000000..005a66be44a2 --- /dev/null +++ b/fs/xfs/xfs_drain.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_trace.h" + +/* + * Use a static key here to reduce the overhead of xfs_drain_rele. If the + * compiler supports jump labels, the static branch will be replaced by a nop + * sled when there are no xfs_drain_wait callers. Online fsck is currently + * the only caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate); + +void +xfs_drain_wait_disable(void) +{ + static_branch_dec(&xfs_drain_waiter_gate); +} + +void +xfs_drain_wait_enable(void) +{ + static_branch_inc(&xfs_drain_waiter_gate); +} + +void +xfs_defer_drain_init( + struct xfs_defer_drain *dr) +{ + atomic_set(&dr->dr_count, 0); + init_waitqueue_head(&dr->dr_waiters); +} + +void +xfs_defer_drain_free(struct xfs_defer_drain *dr) +{ + ASSERT(atomic_read(&dr->dr_count) == 0); +} + +/* Increase the pending intent count. */ +static inline void xfs_defer_drain_grab(struct xfs_defer_drain *dr) +{ + atomic_inc(&dr->dr_count); +} + +static inline bool has_waiters(struct wait_queue_head *wq_head) +{ + /* + * This memory barrier is paired with the one in set_current_state on + * the waiting side. + */ + smp_mb__after_atomic(); + return waitqueue_active(wq_head); +} + +/* Decrease the pending intent count, and wake any waiters, if appropriate. */ +static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr) +{ + if (atomic_dec_and_test(&dr->dr_count) && + static_branch_unlikely(&xfs_drain_waiter_gate) && + has_waiters(&dr->dr_waiters)) + wake_up(&dr->dr_waiters); +} + +/* Are there intents pending? */ +static inline bool xfs_defer_drain_busy(struct xfs_defer_drain *dr) +{ + return atomic_read(&dr->dr_count) > 0; +} + +/* + * Wait for the pending intent count for a drain to hit zero. + * + * Callers must not hold any locks that would prevent intents from being + * finished. + */ +static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr) +{ + return wait_event_killable(dr->dr_waiters, !xfs_defer_drain_busy(dr)); +} + +/* + * Get a passive reference to an AG and declare an intent to update its + * metadata. + */ +struct xfs_perag * +xfs_perag_intent_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + if (!pag) + return NULL; + + xfs_perag_intent_hold(pag); + return pag; +} + +/* + * Release our intent to update this AG's metadata, and then release our + * passive ref to the AG. + */ +void +xfs_perag_intent_put( + struct xfs_perag *pag) +{ + xfs_perag_intent_rele(pag); + xfs_perag_put(pag); +} + +/* + * Declare an intent to update AG metadata. Other threads that need exclusive + * access can decide to back off if they see declared intentions. + */ +void +xfs_perag_intent_hold( + struct xfs_perag *pag) +{ + trace_xfs_perag_intent_hold(pag, __return_address); + xfs_defer_drain_grab(&pag->pag_intents_drain); +} + +/* Release our intent to update this AG's metadata. */ +void +xfs_perag_intent_rele( + struct xfs_perag *pag) +{ + trace_xfs_perag_intent_rele(pag, __return_address); + xfs_defer_drain_rele(&pag->pag_intents_drain); +} + +/* + * Wait for the intent update count for this AG to hit zero. + * Callers must not hold any AG header buffers. + */ +int +xfs_perag_intent_drain( + struct xfs_perag *pag) +{ + trace_xfs_perag_wait_intents(pag, __return_address); + return xfs_defer_drain_wait(&pag->pag_intents_drain); +} + +/* Has anyone declared an intent to update this AG? */ +bool +xfs_perag_intent_busy( + struct xfs_perag *pag) +{ + return xfs_defer_drain_busy(&pag->pag_intents_drain); +} diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h new file mode 100644 index 000000000000..50a5772a8296 --- /dev/null +++ b/fs/xfs/xfs_drain.h @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef XFS_DRAIN_H_ +#define XFS_DRAIN_H_ + +struct xfs_perag; + +#ifdef CONFIG_XFS_DRAIN_INTENTS +/* + * Passive drain mechanism. This data structure tracks a count of some items + * and contains a waitqueue for callers who would like to wake up when the + * count hits zero. + */ +struct xfs_defer_drain { + /* Number of items pending in some part of the filesystem. */ + atomic_t dr_count; + + /* Queue to wait for dri_count to go to zero */ + struct wait_queue_head dr_waiters; +}; + +void xfs_defer_drain_init(struct xfs_defer_drain *dr); +void xfs_defer_drain_free(struct xfs_defer_drain *dr); + +void xfs_drain_wait_disable(void); +void xfs_drain_wait_enable(void); + +/* + * Deferred Work Intent Drains + * =========================== + * + * When a writer thread executes a chain of log intent items, the AG header + * buffer locks will cycle during a transaction roll to get from one intent + * item to the next in a chain. Although scrub takes all AG header buffer + * locks, this isn't sufficient to guard against scrub checking an AG while + * that writer thread is in the middle of finishing a chain because there's no + * higher level locking primitive guarding allocation groups. + * + * When there's a collision, cross-referencing between data structures (e.g. + * rmapbt and refcountbt) yields false corruption events; if repair is running, + * this results in incorrect repairs, which is catastrophic. + * + * The solution is to the perag structure the count of active intents and make + * scrub wait until it has both AG header buffer locks and the intent counter + * reaches zero. It is therefore critical that deferred work threads hold the + * AGI or AGF buffers when decrementing the intent counter. + * + * Given a list of deferred work items, the deferred work manager will complete + * a work item and all the sub-items that the parent item creates before moving + * on to the next work item in the list. This is also true for all levels of + * sub-items. Writer threads are permitted to queue multiple work items + * targetting the same AG, so a deferred work item (such as a BUI) that creates + * sub-items (such as RUIs) must bump the intent counter and maintain it until + * the sub-items can themselves bump the intent counter. + * + * Therefore, the intent count tracks entire lifetimes of deferred work items. + * All functions that create work items must increment the intent counter as + * soon as the item is added to the transaction and cannot drop the counter + * until the item is finished or cancelled. + */ +struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp, + xfs_agnumber_t agno); +void xfs_perag_intent_put(struct xfs_perag *pag); + +void xfs_perag_intent_hold(struct xfs_perag *pag); +void xfs_perag_intent_rele(struct xfs_perag *pag); + +int xfs_perag_intent_drain(struct xfs_perag *pag); +bool xfs_perag_intent_busy(struct xfs_perag *pag); +#else +struct xfs_defer_drain { /* empty */ }; + +#define xfs_defer_drain_free(dr) ((void)0) +#define xfs_defer_drain_init(dr) ((void)0) + +#define xfs_perag_intent_get(mp, agno) xfs_perag_get((mp), (agno)) +#define xfs_perag_intent_put(pag) xfs_perag_put(pag) + +static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { } +static inline void xfs_perag_intent_rele(struct xfs_perag *pag) { } + +#endif /* CONFIG_XFS_DRAIN_INTENTS */ + +#endif /* XFS_DRAIN_H_ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 38b66fcfddc8..f9e36b810663 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -469,7 +469,7 @@ xfs_extent_free_get_group( xfs_agnumber_t agno; agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); - xefi->xefi_pag = xfs_perag_get(mp, agno); + xefi->xefi_pag = xfs_perag_intent_get(mp, agno); } /* Release a passive AG ref after some freeing work. */ @@ -477,7 +477,7 @@ static inline void xfs_extent_free_put_group( struct xfs_extent_free_item *xefi) { - xfs_perag_put(xefi->xefi_pag); + xfs_perag_intent_put(xefi->xefi_pag); } /* Process a free extent. */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index e88f18f85e4b..74dcb05069e8 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -80,6 +80,7 @@ typedef __u32 xfs_nlink_t; #include "xfs_cksum.h" #include "xfs_buf.h" #include "xfs_message.h" +#include "xfs_drain.h" #ifdef __BIG_ENDIAN #define XFS_NATIVE_HOST 1 diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 7edee9590ed6..edd8587658d5 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -374,7 +374,7 @@ xfs_refcount_update_get_group( xfs_agnumber_t agno; agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock); - ri->ri_pag = xfs_perag_get(mp, agno); + ri->ri_pag = xfs_perag_intent_get(mp, agno); } /* Release a passive AG ref after finishing refcounting work. */ @@ -382,7 +382,7 @@ static inline void xfs_refcount_update_put_group( struct xfs_refcount_intent *ri) { - xfs_perag_put(ri->ri_pag); + xfs_perag_intent_put(ri->ri_pag); } /* Process a deferred refcount update. */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 739ddbd04a17..520c7ebdfed8 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -399,7 +399,7 @@ xfs_rmap_update_get_group( xfs_agnumber_t agno; agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock); - ri->ri_pag = xfs_perag_get(mp, agno); + ri->ri_pag = xfs_perag_intent_get(mp, agno); } /* Release a passive AG ref after finishing rmapping work. */ @@ -407,7 +407,7 @@ static inline void xfs_rmap_update_put_group( struct xfs_rmap_intent *ri) { - xfs_perag_put(ri->ri_pag); + xfs_perag_intent_put(ri->ri_pag); } /* Process a deferred rmap update. */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index db09bb771765..cd4ca5b1fcb0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2687,6 +2687,44 @@ DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); +DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, + void *item), + TP_ARGS(mp, dfp, item), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(void *, intent) + __field(void *, item) + __field(char, committed) + __field(int, nr) + ), + TP_fast_assign( + __entry->dev = mp ? mp->m_super->s_dev : 0; + __entry->type = dfp->dfp_type; + __entry->intent = dfp->dfp_intent; + __entry->item = item; + __entry->committed = dfp->dfp_done != NULL; + __entry->nr = dfp->dfp_count; + ), + TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->intent, + __entry->item, + __entry->committed, + __entry->nr) +) +#define DEFINE_DEFER_PENDING_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_defer_pending_item_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, \ + void *item), \ + TP_ARGS(mp, dfp, item)) + +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_add_item); +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_cancel_item); +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item); + /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -4326,6 +4364,39 @@ TRACE_EVENT(xfs_force_shutdown, __entry->line_num) ); +#ifdef CONFIG_XFS_DRAIN_INTENTS +DECLARE_EVENT_CLASS(xfs_perag_intents_class, + TP_PROTO(struct xfs_perag *pag, void *caller_ip), + TP_ARGS(pag, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(long, nr_intents) + __field(void *, caller_ip) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->nr_intents = atomic_read(&pag->pag_intents_drain.dr_count); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->nr_intents, + __entry->caller_ip) +); + +#define DEFINE_PERAG_INTENTS_EVENT(name) \ +DEFINE_EVENT(xfs_perag_intents_class, name, \ + TP_PROTO(struct xfs_perag *pag, void *caller_ip), \ + TP_ARGS(pag, caller_ip)) +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_hold); +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele); +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents); + +#endif /* CONFIG_XFS_DRAIN_INTENTS */ + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH |